Merge branch 'master' into next

Conflicts:
        fs/namei.c

Manually merged per:

diff --cc fs/namei.c
index 734f2b5,bbc15c2..0000000
--- a/fs/namei.c
+++ b/fs/namei.c
@@@ -860,9 -848,8 +849,10 @@@ static int __link_path_walk(const char
                nd->flags |= LOOKUP_CONTINUE;
                err = exec_permission_lite(inode);
                if (err == -EAGAIN)
-                       err = vfs_permission(nd, MAY_EXEC);
+                       err = inode_permission(nd->path.dentry->d_inode,
+                                              MAY_EXEC);
 +              if (!err)
 +                      err = ima_path_check(&nd->path, MAY_EXEC);
                if (err)
                        break;

@@@ -1525,14 -1506,9 +1509,14 @@@ int may_open(struct path *path, int acc
                flag &= ~O_TRUNC;
        }

-       error = vfs_permission(nd, acc_mode);
+       error = inode_permission(inode, acc_mode);
        if (error)
                return error;
 +
-       error = ima_path_check(&nd->path,
++      error = ima_path_check(path,
 +                             acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC));
 +      if (error)
 +              return error;
        /*
         * An append-only file must be opened in append mode for writing.
         */

Signed-off-by: James Morris <jmorris@namei.org>
author: James Morris <jmorris@namei.org> 2009-02-05 19:01:45 -0500
committer: James Morris <jmorris@namei.org> 2009-02-05 19:01:45 -0500
commit: cb5629b10d64a8006622ce3a52bc887d91057d69 (patch)
tree: 7c06d8f30783115e3384721046258ce615b129c5 /fs
parent: 8920d5ad6ba74ae8ab020e90cc4d976980e68701 (diff)
parent: f01d1d546abb2f4028b5299092f529eefb01253a (diff)
512 files changed, 68059 insertions, 11282 deletions
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig
new file mode 100644
index 000000000000..74e0723e90bc
--- /dev/null
+++ b/fs/9p/Kconfig
@@ -0,0 +1,10 @@
+config 9P_FS
+        tristate "Plan 9 Resource Sharing Support (9P2000) (Experimental)"
+        depends on INET && NET_9P && EXPERIMENTAL
+        help
+          If you say Y here, you will get experimental support for
+          Plan 9 resource sharing via the 9P2000 protocol.
+          See <http://v9fs.sf.net> for more information.
+          If unsure, say N.
diff --git a/fs/Kconfig b/fs/Kconfig
index 522469a7eca3..93945dd0b1ae 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -27,141 +27,8 @@ config FS_MBCACHE
        default y if EXT4_FS=y && EXT4_FS_XATTR
        default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS_XATTR
-config REISERFS_FS
+source "fs/reiserfs/Kconfig"
-        tristate "Reiserfs support"
+source "fs/jfs/Kconfig"
-        help
-          Stores not just filenames but the files themselves in a balanced
-          tree.  Uses journalling.
-          Balanced trees are more efficient than traditional file system
-          architectural foundations.
-          In general, ReiserFS is as fast as ext2, but is very efficient with
-          large directories and small files.  Additional patches are needed
-          for NFS and quotas, please see <http://www.namesys.com/> for links.
-          It is more easily extended to have features currently found in
-          database and keyword search systems than block allocation based file
-          systems are.  The next version will be so extended, and will support
-          plugins consistent with our motto ``It takes more than a license to
-          make source code open.''
-          Read <http://www.namesys.com/> to learn more about reiserfs.
-          Sponsored by Threshold Networks, Emusic.com, and Bigstorage.com.
-          If you like it, you can pay us to add new features to it that you
-          need, buy a support contract, or pay us to port it to another OS.
-config REISERFS_CHECK
-        bool "Enable reiserfs debug mode"
-        depends on REISERFS_FS
-        help
-          If you set this to Y, then ReiserFS will perform every check it can
-          possibly imagine of its internal consistency throughout its
-          operation.  It will also go substantially slower.  More than once we
-          have forgotten that this was on, and then gone despondent over the
-          latest benchmarks.:-) Use of this option allows our team to go all
-          out in checking for consistency when debugging without fear of its
-          effect on end users.  If you are on the verge of sending in a bug
-          report, say Y and you might get a useful error message.  Almost
-          everyone should say N.
-config REISERFS_PROC_INFO
-        bool "Stats in /proc/fs/reiserfs"
-        depends on REISERFS_FS && PROC_FS
-        help
-          Create under /proc/fs/reiserfs a hierarchy of files, displaying
-          various ReiserFS statistics and internal data at the expense of
-          making your kernel or module slightly larger (+8 KB). This also
-          increases the amount of kernel memory required for each mount.
-          Almost everyone but ReiserFS developers and people fine-tuning
-          reiserfs or tracing problems should say N.
-config REISERFS_FS_XATTR
-        bool "ReiserFS extended attributes"
-        depends on REISERFS_FS
-        help
-          Extended attributes are name:value pairs associated with inodes by
-          the kernel or by users (see the attr(5) manual page, or visit
-          <http://acl.bestbits.at/> for details).
-          If unsure, say N.
-config REISERFS_FS_POSIX_ACL
-        bool "ReiserFS POSIX Access Control Lists"
-        depends on REISERFS_FS_XATTR
-        select FS_POSIX_ACL
-        help
-          Posix Access Control Lists (ACLs) support permissions for users and
-          groups beyond the owner/group/world scheme.
-          To learn more about Access Control Lists, visit the Posix ACLs for
-          Linux website <http://acl.bestbits.at/>.
-          If you don't know what Access Control Lists are, say N
-config REISERFS_FS_SECURITY
-        bool "ReiserFS Security Labels"
-        depends on REISERFS_FS_XATTR
-        help
-          Security labels support alternative access control models
-          implemented by security modules like SELinux.  This option
-          enables an extended attribute handler for file security
-          labels in the ReiserFS filesystem.
-          If you are not using a security module that requires using
-          extended attributes for file security labels, say N.
-config JFS_FS
-        tristate "JFS filesystem support"
-        select NLS
-        help
-          This is a port of IBM's Journaled Filesystem .  More information is
-          available in the file <file:Documentation/filesystems/jfs.txt>.
-          If you do not intend to use the JFS filesystem, say N.
-config JFS_POSIX_ACL
-        bool "JFS POSIX Access Control Lists"
-        depends on JFS_FS
-        select FS_POSIX_ACL
-        help
-          Posix Access Control Lists (ACLs) support permissions for users and
-          groups beyond the owner/group/world scheme.
-          To learn more about Access Control Lists, visit the Posix ACLs for
-          Linux website <http://acl.bestbits.at/>.
-          If you don't know what Access Control Lists are, say N
-config JFS_SECURITY
-        bool "JFS Security Labels"
-        depends on JFS_FS
-        help
-          Security labels support alternative access control models
-          implemented by security modules like SELinux.  This option
-          enables an extended attribute handler for file security
-          labels in the jfs filesystem.
-          If you are not using a security module that requires using
-          extended attributes for file security labels, say N.
-config JFS_DEBUG
-        bool "JFS debugging"
-        depends on JFS_FS
-        help
-          If you are experiencing any problems with the JFS filesystem, say
-          Y here.  This will result in additional debugging messages to be
-          written to the system log.  Under normal circumstances, this
-          results in very little overhead.
-config JFS_STATISTICS
-        bool "JFS statistics"
-        depends on JFS_FS
-        help
-          Enabling this option will cause statistics from the JFS file system
-          to be made available to the user in the /proc/fs/jfs/ directory.
 config FS_POSIX_ACL
 # Posix ACL utility routines (for now, only ext2/ext3/jfs/reiserfs/nfs4)
@@ -182,132 +49,12 @@ config FILE_LOCKING
 source "fs/xfs/Kconfig"
 source "fs/gfs2/Kconfig"
+source "fs/ocfs2/Kconfig"
-config OCFS2_FS
+source "fs/btrfs/Kconfig"
-        tristate "OCFS2 file system support"
-        depends on NET && SYSFS
-        select CONFIGFS_FS
-        select JBD2
-        select CRC32
-        help
-          OCFS2 is a general purpose extent based shared disk cluster file
-          system with many similarities to ext3. It supports 64 bit inode
-          numbers, and has automatically extending metadata groups which may
-          also make it attractive for non-clustered use.
-          You'll want to install the ocfs2-tools package in order to at least
-          get "mount.ocfs2".
-          Project web page:    http://oss.oracle.com/projects/ocfs2
-          Tools web page:      http://oss.oracle.com/projects/ocfs2-tools
-          OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
-          For more information on OCFS2, see the file
-          <file:Documentation/filesystems/ocfs2.txt>.
-config OCFS2_FS_O2CB
-        tristate "O2CB Kernelspace Clustering"
-        depends on OCFS2_FS
-        default y
-        help
-          OCFS2 includes a simple kernelspace clustering package, the OCFS2
-          Cluster Base.  It only requires a very small userspace component
-          to configure it. This comes with the standard ocfs2-tools package.
-          O2CB is limited to maintaining a cluster for OCFS2 file systems.
-          It cannot manage any other cluster applications.
-          It is always safe to say Y here, as the clustering method is
-          run-time selectable.
-config OCFS2_FS_USERSPACE_CLUSTER
-        tristate "OCFS2 Userspace Clustering"
-        depends on OCFS2_FS && DLM
-        default y
-        help
-          This option will allow OCFS2 to use userspace clustering services
-          in conjunction with the DLM in fs/dlm.  If you are using a
-          userspace cluster manager, say Y here.
-          It is safe to say Y, as the clustering method is run-time
-          selectable.
-config OCFS2_FS_STATS
-        bool "OCFS2 statistics"
-        depends on OCFS2_FS
-        default y
-        help
-          This option allows some fs statistics to be captured. Enabling
-          this option may increase the memory consumption.
-config OCFS2_DEBUG_MASKLOG
-        bool "OCFS2 logging support"
-        depends on OCFS2_FS
-        default y
-        help
-          The ocfs2 filesystem has an extensive logging system.  The system
-          allows selection of events to log via files in /sys/o2cb/logmask/.
-          This option will enlarge your kernel, but it allows debugging of
-          ocfs2 filesystem issues.
-config OCFS2_DEBUG_FS
-        bool "OCFS2 expensive checks"
-        depends on OCFS2_FS
-        default n
-        help
-          This option will enable expensive consistency checks. Enable
-          this option for debugging only as it is likely to decrease
-          performance of the filesystem.
-config OCFS2_COMPAT_JBD
-        bool "Use JBD for compatibility"
-        depends on OCFS2_FS
-        default n
-        select JBD
-        help
-          The ocfs2 filesystem now uses JBD2 for its journalling.  JBD2
-          is backwards compatible with JBD.  It is safe to say N here.
-          However, if you really want to use the original JBD, say Y here.
 endif # BLOCK
-config DNOTIFY
+source "fs/notify/Kconfig"
-        bool "Dnotify support"
-        default y
-        help
-          Dnotify is a directory-based per-fd file change notification system
-          that uses signals to communicate events to user-space.  There exist
-          superior alternatives, but some applications may still rely on
-          dnotify.
-          If unsure, say Y.
-config INOTIFY
-        bool "Inotify file change notification support"
-        default y
-        ---help---
-          Say Y here to enable inotify support.  Inotify is a file change
-          notification system and a replacement for dnotify.  Inotify fixes
-          numerous shortcomings in dnotify and introduces several new features
-          including multiple file events, one-shot support, and unmount
-          notification.
-          For more information, see <file:Documentation/filesystems/inotify.txt>
-          If unsure, say Y.
-config INOTIFY_USER
-        bool "Inotify support for userspace"
-        depends on INOTIFY
-        default y
-        ---help---
-          Say Y here to enable inotify support for userspace, including the
-          associated system calls.  Inotify allows monitoring of both files and
-          directories via a single open fd.  Events are read from the file
-          descriptor, which is also select()- and poll()-able.
-          For more information, see <file:Documentation/filesystems/inotify.txt>
-          If unsure, say Y.
 config QUOTA
        bool "Quota support"
@@ -340,6 +87,10 @@ config PRINT_QUOTA_WARNING
          Note that this behavior is currently deprecated and may go away in
          future. Please use notification via netlink socket instead.
+# Generic support for tree structured quota files. Selected when needed.
+config QUOTA_TREE
+         tristate
 config QFMT_V1
        tristate "Old quota format support"
        depends on QUOTA
@@ -351,6 +102,7 @@ config QFMT_V1
 config QFMT_V2
        tristate "Quota format v2 support"
        depends on QUOTA
+        select QUOTA_TREE
        help
          This quota format allows using quotas with 32-bit UIDs/GIDs. If you
          need this functionality say Y here.
@@ -360,64 +112,9 @@ config QUOTACTL
        depends on XFS_QUOTA || QUOTA
        default y
-config AUTOFS_FS
+source "fs/autofs/Kconfig"
-        tristate "Kernel automounter support"
+source "fs/autofs4/Kconfig"
-        help
+source "fs/fuse/Kconfig"
-          The automounter is a tool to automatically mount remote file systems
-          on demand. This implementation is partially kernel-based to reduce
-          overhead in the already-mounted case; this is unlike the BSD
-          automounter (amd), which is a pure user space daemon.
-          To use the automounter you need the user-space tools from the autofs
-          package; you can find the location in <file:Documentation/Changes>.
-          You also want to answer Y to "NFS file system support", below.
-          If you want to use the newer version of the automounter with more
-          features, say N here and say Y to "Kernel automounter v4 support",
-          below.
-          To compile this support as a module, choose M here: the module will be
-          called autofs.
-          If you are not a part of a fairly large, distributed network, you
-          probably do not need an automounter, and can say N here.
-config AUTOFS4_FS
-        tristate "Kernel automounter version 4 support (also supports v3)"
-        help
-          The automounter is a tool to automatically mount remote file systems
-          on demand. This implementation is partially kernel-based to reduce
-          overhead in the already-mounted case; this is unlike the BSD
-          automounter (amd), which is a pure user space daemon.
-          To use the automounter you need the user-space tools from
-          <ftp://ftp.kernel.org/pub/linux/daemons/autofs/v4/>; you also
-          want to answer Y to "NFS file system support", below.
-          To compile this support as a module, choose M here: the module will be
-          called autofs4.  You will need to add "alias autofs autofs4" to your
-          modules configuration file.
-          If you are not a part of a fairly large, distributed network or
-          don't have a laptop which needs to dynamically reconfigure to the
-          local network, you probably do not need an automounter, and can say
-          N here.
-config FUSE_FS
-        tristate "FUSE (Filesystem in Userspace) support"
-        help
-          With FUSE it is possible to implement a fully functional filesystem
-          in a userspace program.
-          There's also companion library: libfuse.  This library along with
-          utilities is available from the FUSE homepage:
-          <http://fuse.sourceforge.net/>
-          See <file:Documentation/filesystems/fuse.txt> for more information.
-          See <file:Documentation/Changes> for needed library/utility version.
-          If you want to develop a userspace FS, or if you want to use
-          a filesystem based on FUSE, answer Y or M.
 config GENERIC_ACL
        bool
@@ -426,64 +123,8 @@ config GENERIC_ACL
 if BLOCK
 menu "CD-ROM/DVD Filesystems"
-config ISO9660_FS
+source "fs/isofs/Kconfig"
-        tristate "ISO 9660 CDROM file system support"
+source "fs/udf/Kconfig"
-        help
-          This is the standard file system used on CD-ROMs.  It was previously
-          known as "High Sierra File System" and is called "hsfs" on other
-          Unix systems.  The so-called Rock-Ridge extensions which allow for
-          long Unix filenames and symbolic links are also supported by this
-          driver.  If you have a CD-ROM drive and want to do more with it than
-          just listen to audio CDs and watch its LEDs, say Y (and read
-          <file:Documentation/filesystems/isofs.txt> and the CD-ROM-HOWTO,
-          available from <http://www.tldp.org/docs.html#howto>), thereby
-          enlarging your kernel by about 27 KB; otherwise say N.
-          To compile this file system support as a module, choose M here: the
-          module will be called isofs.
-config JOLIET
-        bool "Microsoft Joliet CDROM extensions"
-        depends on ISO9660_FS
-        select NLS
-        help
-          Joliet is a Microsoft extension for the ISO 9660 CD-ROM file system
-          which allows for long filenames in unicode format (unicode is the
-          new 16 bit character code, successor to ASCII, which encodes the
-          characters of almost all languages of the world; see
-          <http://www.unicode.org/> for more information).  Say Y here if you
-          want to be able to read Joliet CD-ROMs under Linux.
-config ZISOFS
-        bool "Transparent decompression extension"
-        depends on ISO9660_FS
-        select ZLIB_INFLATE
-        help
-          This is a Linux-specific extension to RockRidge which lets you store
-          data in compressed form on a CD-ROM and have it transparently
-          decompressed when the CD-ROM is accessed.  See
-          <http://www.kernel.org/pub/linux/utils/fs/zisofs/> for the tools
-          necessary to create such a filesystem.  Say Y here if you want to be
-          able to read such compressed CD-ROMs.
-config UDF_FS
-        tristate "UDF file system support"
-        select CRC_ITU_T
-        help
-          This is the new file system used on some CD-ROMs and DVDs. Say Y if
-          you intend to mount DVD discs or CDRW's written in packet mode, or
-          if written to by other UDF utilities, such as DirectCD.
-          Please read <file:Documentation/filesystems/udf.txt>.
-          To compile this file system support as a module, choose M here: the
-          module will be called udf.
-          If unsure, say N.
-config UDF_NLS
-        bool
-        default y
-        depends on (UDF_FS=m && NLS) || (UDF_FS=y && NLS=y)
 endmenu
 endif # BLOCK
@@ -491,182 +132,8 @@ endif # BLOCK
 if BLOCK
 menu "DOS/FAT/NT Filesystems"
-config FAT_FS
+source "fs/fat/Kconfig"
-        tristate
+source "fs/ntfs/Kconfig"
-        select NLS
-        help
-          If you want to use one of the FAT-based file systems (the MS-DOS and
-          VFAT (Windows 95) file systems), then you must say Y or M here
-          to include FAT support. You will then be able to mount partitions or
-          diskettes with FAT-based file systems and transparently access the
-          files on them, i.e. MSDOS files will look and behave just like all
-          other Unix files.
-          This FAT support is not a file system in itself, it only provides
-          the foundation for the other file systems. You will have to say Y or
-          M to at least one of "MSDOS fs support" or "VFAT fs support" in
-          order to make use of it.
-          Another way to read and write MSDOS floppies and hard drive
-          partitions from within Linux (but not transparently) is with the
-          mtools ("man mtools") program suite. You don't need to say Y here in
-          order to do that.
-          If you need to move large files on floppies between a DOS and a
-          Linux box, say Y here, mount the floppy under Linux with an MSDOS
-          file system and use GNU tar's M option. GNU tar is a program
-          available for Unix and DOS ("man tar" or "info tar").
-          The FAT support will enlarge your kernel by about 37 KB. If unsure,
-          say Y.
-          To compile this as a module, choose M here: the module will be called
-          fat.  Note that if you compile the FAT support as a module, you
-          cannot compile any of the FAT-based file systems into the kernel
-          -- they will have to be modules as well.
-config MSDOS_FS
-        tristate "MSDOS fs support"
-        select FAT_FS
-        help
-          This allows you to mount MSDOS partitions of your hard drive (unless
-          they are compressed; to access compressed MSDOS partitions under
-          Linux, you can either use the DOS emulator DOSEMU, described in the
-          DOSEMU-HOWTO, available from
-          <http://www.tldp.org/docs.html#howto>, or try dmsdosfs in
-          <ftp://ibiblio.org/pub/Linux/system/filesystems/dosfs/>. If you
-          intend to use dosemu with a non-compressed MSDOS partition, say Y
-          here) and MSDOS floppies. This means that file access becomes
-          transparent, i.e. the MSDOS files look and behave just like all
-          other Unix files.
-          If you have Windows 95 or Windows NT installed on your MSDOS
-          partitions, you should use the VFAT file system (say Y to "VFAT fs
-          support" below), or you will not be able to see the long filenames
-          generated by Windows 95 / Windows NT.
-          This option will enlarge your kernel by about 7 KB. If unsure,
-          answer Y. This will only work if you said Y to "DOS FAT fs support"
-          as well. To compile this as a module, choose M here: the module will
-          be called msdos.
-config VFAT_FS
-        tristate "VFAT (Windows-95) fs support"
-        select FAT_FS
-        help
-          This option provides support for normal Windows file systems with
-          long filenames.  That includes non-compressed FAT-based file systems
-          used by Windows 95, Windows 98, Windows NT 4.0, and the Unix
-          programs from the mtools package.
-          The VFAT support enlarges your kernel by about 10 KB and it only
-          works if you said Y to the "DOS FAT fs support" above.  Please read
-          the file <file:Documentation/filesystems/vfat.txt> for details.  If
-          unsure, say Y.
-          To compile this as a module, choose M here: the module will be called
-          vfat.
-config FAT_DEFAULT_CODEPAGE
-        int "Default codepage for FAT"
-        depends on MSDOS_FS || VFAT_FS
-        default 437
-        help
-          This option should be set to the codepage of your FAT filesystems.
-          It can be overridden with the "codepage" mount option.
-          See <file:Documentation/filesystems/vfat.txt> for more information.
-config FAT_DEFAULT_IOCHARSET
-        string "Default iocharset for FAT"
-        depends on VFAT_FS
-        default "iso8859-1"
-        help
-          Set this to the default input/output character set you'd
-          like FAT to use. It should probably match the character set
-          that most of your FAT filesystems use, and can be overridden
-          with the "iocharset" mount option for FAT filesystems.
-          Note that "utf8" is not recommended for FAT filesystems.
-          If unsure, you shouldn't set "utf8" here.
-          See <file:Documentation/filesystems/vfat.txt> for more information.
-config NTFS_FS
-        tristate "NTFS file system support"
-        select NLS
-        help
-          NTFS is the file system of Microsoft Windows NT, 2000, XP and 2003.
-          Saying Y or M here enables read support.  There is partial, but
-          safe, write support available.  For write support you must also
-          say Y to "NTFS write support" below.
-          There are also a number of user-space tools available, called
-          ntfsprogs.  These include ntfsundelete and ntfsresize, that work
-          without NTFS support enabled in the kernel.
-          This is a rewrite from scratch of Linux NTFS support and replaced
-          the old NTFS code starting with Linux 2.5.11.  A backport to
-          the Linux 2.4 kernel series is separately available as a patch
-          from the project web site.
-          For more information see <file:Documentation/filesystems/ntfs.txt>
-          and <http://www.linux-ntfs.org/>.
-          To compile this file system support as a module, choose M here: the
-          module will be called ntfs.
-          If you are not using Windows NT, 2000, XP or 2003 in addition to
-          Linux on your computer it is safe to say N.
-config NTFS_DEBUG
-        bool "NTFS debugging support"
-        depends on NTFS_FS
-        help
-          If you are experiencing any problems with the NTFS file system, say
-          Y here.  This will result in additional consistency checks to be
-          performed by the driver as well as additional debugging messages to
-          be written to the system log.  Note that debugging messages are
-          disabled by default.  To enable them, supply the option debug_msgs=1
-          at the kernel command line when booting the kernel or as an option
-          to insmod when loading the ntfs module.  Once the driver is active,
-          you can enable debugging messages by doing (as root):
-          echo 1 > /proc/sys/fs/ntfs-debug
-          Replacing the "1" with "0" would disable debug messages.
-          If you leave debugging messages disabled, this results in little
-          overhead, but enabling debug messages results in very significant
-          slowdown of the system.
-          When reporting bugs, please try to have available a full dump of
-          debugging messages while the misbehaviour was occurring.
-config NTFS_RW
-        bool "NTFS write support"
-        depends on NTFS_FS
-        help
-          This enables the partial, but safe, write support in the NTFS driver.
-          The only supported operation is overwriting existing files, without
-          changing the file length.  No file or directory creation, deletion or
-          renaming is possible.  Note only non-resident files can be written to
-          so you may find that some very small files (<500 bytes or so) cannot
-          be written to.
-          While we cannot guarantee that it will not damage any data, we have
-          so far not received a single report where the driver would have
-          damaged someones data so we assume it is perfectly safe to use.
-          Note:  While write support is safe in this version (a rewrite from
-          scratch of the NTFS support), it should be noted that the old NTFS
-          write support, included in Linux 2.5.10 and before (since 1997),
-          is not safe.
-          This is currently useful with TopologiLinux.  TopologiLinux is run
-          on top of any DOS/Microsoft Windows system without partitioning your
-          hard disk.  Unlike other Linux distributions TopologiLinux does not
-          need its own partition.  For more information see
-          <http://topologi-linux.sourceforge.net/>
-          It is perfectly safe to say N here.
 endmenu
 endif # BLOCK
@@ -674,30 +141,7 @@ endif # BLOCK
 menu "Pseudo filesystems"
 source "fs/proc/Kconfig"
+source "fs/sysfs/Kconfig"
-config SYSFS
-        bool "sysfs file system support" if EMBEDDED
-        default y
-        help
-        The sysfs filesystem is a virtual filesystem that the kernel uses to
-        export internal kernel objects, their attributes, and their
-        relationships to one another.
-        Users can use sysfs to ascertain useful information about the running
-        kernel, such as the devices the kernel has discovered on each bus and
-        which driver each is bound to. sysfs can also be used to tune devices
-        and other kernel subsystems.
-        Some system agents rely on the information in sysfs to operate.
-        /sbin/hotplug uses device and object attributes in sysfs to assist in
-        delegating policy decisions, like persistently naming devices.
-        sysfs is currently used by the block subsystem to mount the root
-        partition.  If sysfs is disabled you must specify the boot device on
-        the kernel boot command line via its major and minor numbers.  For
-        example, "root=03:01" for /dev/hda1.
-        Designers of embedded systems may wish to say N here to conserve space.
 config TMPFS
        bool "Virtual memory file system support (former shm fs)"
@@ -738,391 +182,48 @@ config HUGETLBFS
 config HUGETLB_PAGE
        def_bool HUGETLBFS
-config CONFIGFS_FS
+source "fs/configfs/Kconfig"
-        tristate "Userspace-driven configuration filesystem"
-        depends on SYSFS
-        help
-          configfs is a ram-based filesystem that provides the converse
-          of sysfs's functionality. Where sysfs is a filesystem-based
-          view of kernel objects, configfs is a filesystem-based manager
-          of kernel objects, or config_items.
-          Both sysfs and configfs can and should exist together on the
-          same system. One is not a replacement for the other.
 endmenu
-menu "Miscellaneous filesystems"
+menuconfig MISC_FILESYSTEMS
+        bool "Miscellaneous filesystems"
-config ADFS_FS
+        default y
-        tristate "ADFS file system support (EXPERIMENTAL)"
+        ---help---
-        depends on BLOCK && EXPERIMENTAL
+          Say Y here to get to see options for various miscellaneous
-        help
+          filesystems, such as filesystems that came from other
-          The Acorn Disc Filing System is the standard file system of the
+          operating systems.
-          RiscOS operating system which runs on Acorn's ARM-based Risc PC
-          systems and the Acorn Archimedes range of machines. If you say Y
-          here, Linux will be able to read from ADFS partitions on hard drives
-          and from ADFS-formatted floppy discs. If you also want to be able to
-          write to those devices, say Y to "ADFS write support" below.
-          The ADFS partition should be the first partition (i.e.,
-          /dev/[hs]d?1) on each of your drives. Please read the file
-          <file:Documentation/filesystems/adfs.txt> for further details.
-          To compile this code as a module, choose M here: the module will be
-          called adfs.
-          If unsure, say N.
-config ADFS_FS_RW
-        bool "ADFS write support (DANGEROUS)"
-        depends on ADFS_FS
-        help
-          If you say Y here, you will be able to write to ADFS partitions on
-          hard drives and ADFS-formatted floppy disks. This is experimental
-          codes, so if you're unsure, say N.
-config AFFS_FS
-        tristate "Amiga FFS file system support (EXPERIMENTAL)"
-        depends on BLOCK && EXPERIMENTAL
-        help
-          The Fast File System (FFS) is the common file system used on hard
-          disks by Amiga(tm) systems since AmigaOS Version 1.3 (34.20).  Say Y
-          if you want to be able to read and write files from and to an Amiga
-          FFS partition on your hard drive.  Amiga floppies however cannot be
-          read with this driver due to an incompatibility of the floppy
-          controller used in an Amiga and the standard floppy controller in
-          PCs and workstations. Read <file:Documentation/filesystems/affs.txt>
-          and <file:fs/affs/Changes>.
-          With this driver you can also mount disk files used by Bernd
-          Schmidt's Un*X Amiga Emulator
-          (<http://www.freiburg.linux.de/~uae/>).
-          If you want to do this, you will also need to say Y or M to "Loop
-          device support", above.
-          To compile this file system support as a module, choose M here: the
-          module will be called affs.  If unsure, say N.
-config ECRYPT_FS
-        tristate "eCrypt filesystem layer support (EXPERIMENTAL)"
-        depends on EXPERIMENTAL && KEYS && CRYPTO && NET
-        help
-          Encrypted filesystem that operates on the VFS layer.  See
-          <file:Documentation/filesystems/ecryptfs.txt> to learn more about
-          eCryptfs.  Userspace components are required and can be
-          obtained from <http://ecryptfs.sf.net>.
-          To compile this file system support as a module, choose M here: the
-          module will be called ecryptfs.
-config HFS_FS
-        tristate "Apple Macintosh file system support (EXPERIMENTAL)"
-        depends on BLOCK && EXPERIMENTAL
-        select NLS
-        help
-          If you say Y here, you will be able to mount Macintosh-formatted
-          floppy disks and hard drive partitions with full read-write access.
-          Please read <file:Documentation/filesystems/hfs.txt> to learn about
-          the available mount options.
-          To compile this file system support as a module, choose M here: the
-          module will be called hfs.
-config HFSPLUS_FS
-        tristate "Apple Extended HFS file system support"
-        depends on BLOCK
-        select NLS
-        select NLS_UTF8
-        help
-          If you say Y here, you will be able to mount extended format
-          Macintosh-formatted hard drive partitions with full read-write access.
-          This file system is often called HFS+ and was introduced with
-          MacOS 8. It includes all Mac specific filesystem data such as
-          data forks and creator codes, but it also has several UNIX
-          style features such as file ownership and permissions.
-config BEFS_FS
-        tristate "BeOS file system (BeFS) support (read only) (EXPERIMENTAL)"
-        depends on BLOCK && EXPERIMENTAL
-        select NLS
-        help
-          The BeOS File System (BeFS) is the native file system of Be, Inc's
-          BeOS. Notable features include support for arbitrary attributes
-          on files and directories, and database-like indices on selected
-          attributes. (Also note that this driver doesn't make those features
-          available at this time). It is a 64 bit filesystem, so it supports
-          extremely large volumes and files.
-          If you use this filesystem, you should also say Y to at least one
-          of the NLS (native language support) options below.
-          If you don't know what this is about, say N.
-          To compile this as a module, choose M here: the module will be
-          called befs.
-config BEFS_DEBUG
-        bool "Debug BeFS"
-        depends on BEFS_FS
-        help
-          If you say Y here, you can use the 'debug' mount option to enable
-          debugging output from the driver.
-config BFS_FS
-        tristate "BFS file system support (EXPERIMENTAL)"
-        depends on BLOCK && EXPERIMENTAL
-        help
-          Boot File System (BFS) is a file system used under SCO UnixWare to
-          allow the bootloader access to the kernel image and other important
-          files during the boot process.  It is usually mounted under /stand
-          and corresponds to the slice marked as "STAND" in the UnixWare
-          partition.  You should say Y if you want to read or write the files
-          on your /stand slice from within Linux.  You then also need to say Y
-          to "UnixWare slices support", below.  More information about the BFS
-          file system is contained in the file
-          <file:Documentation/filesystems/bfs.txt>.
-          If you don't know what this is about, say N.
-          To compile this as a module, choose M here: the module will be called
-          bfs.  Note that the file system of your root partition (the one
-          containing the directory /) cannot be compiled as a module.
-config EFS_FS
+          This option alone does not add any kernel code.
-        tristate "EFS file system support (read only) (EXPERIMENTAL)"
-        depends on BLOCK && EXPERIMENTAL
-        help
-          EFS is an older file system used for non-ISO9660 CD-ROMs and hard
-          disk partitions by SGI's IRIX operating system (IRIX 6.0 and newer
-          uses the XFS file system for hard disk partitions however).
-          This implementation only offers read-only access. If you don't know
+          If you say N, all options in this submenu will be skipped and
-          what all this is about, it's safe to say N. For more information
+          disabled; if unsure, say Y here.
-          about EFS see its home page at <http://aeschi.ch.eu.org/efs/>.
-          To compile the EFS file system support as a module, choose M here: the
+if MISC_FILESYSTEMS
-          module will be called efs.
+source "fs/adfs/Kconfig"
+source "fs/affs/Kconfig"
+source "fs/ecryptfs/Kconfig"
+source "fs/hfs/Kconfig"
+source "fs/hfsplus/Kconfig"
+source "fs/befs/Kconfig"
+source "fs/bfs/Kconfig"
+source "fs/efs/Kconfig"
 source "fs/jffs2/Kconfig"
 # UBIFS File system configuration
 source "fs/ubifs/Kconfig"
+source "fs/cramfs/Kconfig"
-config CRAMFS
+source "fs/squashfs/Kconfig"
-        tristate "Compressed ROM file system support (cramfs)"
+source "fs/freevxfs/Kconfig"
-        depends on BLOCK
+source "fs/minix/Kconfig"
-        select ZLIB_INFLATE
+source "fs/omfs/Kconfig"
-        help
+source "fs/hpfs/Kconfig"
-          Saying Y here includes support for CramFs (Compressed ROM File
+source "fs/qnx4/Kconfig"
-          System).  CramFs is designed to be a simple, small, and compressed
+source "fs/romfs/Kconfig"
-          file system for ROM based embedded systems.  CramFs is read-only,
+source "fs/sysv/Kconfig"
-          limited to 256MB file systems (with 16MB files), and doesn't support
+source "fs/ufs/Kconfig"
-          16/32 bits uid/gid, hard links and timestamps.
+endif # MISC_FILESYSTEMS
-          See <file:Documentation/filesystems/cramfs.txt> and
-          <file:fs/cramfs/README> for further information.
-          To compile this as a module, choose M here: the module will be called
-          cramfs.  Note that the root file system (the one containing the
-          directory /) cannot be compiled as a module.
-          If unsure, say N.
-config VXFS_FS
-        tristate "FreeVxFS file system support (VERITAS VxFS(TM) compatible)"
-        depends on BLOCK
-        help
-          FreeVxFS is a file system driver that support the VERITAS VxFS(TM)
-          file system format.  VERITAS VxFS(TM) is the standard file system
-          of SCO UnixWare (and possibly others) and optionally available
-          for Sunsoft Solaris, HP-UX and many other operating systems.
-          Currently only readonly access is supported.
-          NOTE: the file system type as used by mount(1), mount(2) and
-          fstab(5) is 'vxfs' as it describes the file system format, not
-          the actual driver.
-          To compile this as a module, choose M here: the module will be
-          called freevxfs.  If unsure, say N.
-config MINIX_FS
-        tristate "Minix file system support"
-        depends on BLOCK
-        help
-          Minix is a simple operating system used in many classes about OS's.
-          The minix file system (method to organize files on a hard disk
-          partition or a floppy disk) was the original file system for Linux,
-          but has been superseded by the second extended file system ext2fs.
-          You don't want to use the minix file system on your hard disk
-          because of certain built-in restrictions, but it is sometimes found
-          on older Linux floppy disks.  This option will enlarge your kernel
-          by about 28 KB. If unsure, say N.
-          To compile this file system support as a module, choose M here: the
-          module will be called minix.  Note that the file system of your root
-          partition (the one containing the directory /) cannot be compiled as
-          a module.
-config OMFS_FS
-        tristate "SonicBlue Optimized MPEG File System support"
-        depends on BLOCK
-        select CRC_ITU_T
-        help
-          This is the proprietary file system used by the Rio Karma music
-          player and ReplayTV DVR.  Despite the name, this filesystem is not
-          more efficient than a standard FS for MPEG files, in fact likely
-          the opposite is true.  Say Y if you have either of these devices
-          and wish to mount its disk.
-          To compile this file system support as a module, choose M here: the
-          module will be called omfs.  If unsure, say N.
-config HPFS_FS
-        tristate "OS/2 HPFS file system support"
-        depends on BLOCK
-        help
-          OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS
-          is the file system used for organizing files on OS/2 hard disk
-          partitions. Say Y if you want to be able to read files from and
-          write files to an OS/2 HPFS partition on your hard drive. OS/2
-          floppies however are in regular MSDOS format, so you don't need this
-          option in order to be able to read them. Read
-          <file:Documentation/filesystems/hpfs.txt>.
-          To compile this file system support as a module, choose M here: the
-          module will be called hpfs.  If unsure, say N.
-config QNX4FS_FS
-        tristate "QNX4 file system support (read only)"
-        depends on BLOCK
-        help
-          This is the file system used by the real-time operating systems
-          QNX 4 and QNX 6 (the latter is also called QNX RTP).
-          Further information is available at <http://www.qnx.com/>.
-          Say Y if you intend to mount QNX hard disks or floppies.
-          Unless you say Y to "QNX4FS read-write support" below, you will
-          only be able to read these file systems.
-          To compile this file system support as a module, choose M here: the
-          module will be called qnx4.
-          If you don't know whether you need it, then you don't need it:
-          answer N.
-config QNX4FS_RW
-        bool "QNX4FS write support (DANGEROUS)"
-        depends on QNX4FS_FS && EXPERIMENTAL && BROKEN
-        help
-          Say Y if you want to test write support for QNX4 file systems.
-          It's currently broken, so for now:
-          answer N.
-config ROMFS_FS
-        tristate "ROM file system support"
-        depends on BLOCK
-        ---help---
-          This is a very small read-only file system mainly intended for
-          initial ram disks of installation disks, but it could be used for
-          other read-only media as well.  Read
-          <file:Documentation/filesystems/romfs.txt> for details.
-          To compile this file system support as a module, choose M here: the
-          module will be called romfs.  Note that the file system of your
-          root partition (the one containing the directory /) cannot be a
-          module.
-          If you don't know whether you need it, then you don't need it:
-          answer N.
-config SYSV_FS
-        tristate "System V/Xenix/V7/Coherent file system support"
-        depends on BLOCK
-        help
-          SCO, Xenix and Coherent are commercial Unix systems for Intel
-          machines, and Version 7 was used on the DEC PDP-11. Saying Y
-          here would allow you to read from their floppies and hard disk
-          partitions.
-          If you have floppies or hard disk partitions like that, it is likely
-          that they contain binaries from those other Unix systems; in order
-          to run these binaries, you will want to install linux-abi which is
-          a set of kernel modules that lets you run SCO, Xenix, Wyse,
-          UnixWare, Dell Unix and System V programs under Linux.  It is
-          available via FTP (user: ftp) from
-          <ftp://ftp.openlinux.org/pub/people/hch/linux-abi/>).
-          NOTE: that will work only for binaries from Intel-based systems;
-          PDP ones will have to wait until somebody ports Linux to -11 ;-)
-          If you only intend to mount files from some other Unix over the
-          network using NFS, you don't need the System V file system support
-          (but you need NFS file system support obviously).
-          Note that this option is generally not needed for floppies, since a
-          good portable way to transport files and directories between unixes
-          (and even other operating systems) is given by the tar program ("man
-          tar" or preferably "info tar").  Note also that this option has
-          nothing whatsoever to do with the option "System V IPC". Read about
-          the System V file system in
-          <file:Documentation/filesystems/sysv-fs.txt>.
-          Saying Y here will enlarge your kernel by about 27 KB.
-          To compile this as a module, choose M here: the module will be called
-          sysv.
-          If you haven't heard about all of this before, it's safe to say N.
-config UFS_FS
-        tristate "UFS file system support (read only)"
-        depends on BLOCK
-        help
-          BSD and derivate versions of Unix (such as SunOS, FreeBSD, NetBSD,
-          OpenBSD and NeXTstep) use a file system called UFS. Some System V
-          Unixes can create and mount hard disk partitions and diskettes using
-          this file system as well. Saying Y here will allow you to read from
-          these partitions; if you also want to write to them, say Y to the
-          experimental "UFS file system write support", below. Please read the
-          file <file:Documentation/filesystems/ufs.txt> for more information.
-          The recently released UFS2 variant (used in FreeBSD 5.x) is
-          READ-ONLY supported.
-          Note that this option is generally not needed for floppies, since a
-          good portable way to transport files and directories between unixes
-          (and even other operating systems) is given by the tar program ("man
-          tar" or preferably "info tar").
-          When accessing NeXTstep files, you may need to convert them from the
-          NeXT character set to the Latin1 character set; use the program
-          recode ("info recode") for this purpose.
-          To compile the UFS file system support as a module, choose M here: the
-          module will be called ufs.
-          If you haven't heard about all of this before, it's safe to say N.
-config UFS_FS_WRITE
-        bool "UFS file system write support (DANGEROUS)"
-        depends on UFS_FS && EXPERIMENTAL
-        help
-          Say Y here if you want to try writing to UFS partitions. This is
-          experimental, so you should back up your UFS partitions beforehand.
-config UFS_DEBUG
-        bool "UFS debugging"
-        depends on UFS_FS
-        help
-          If you are experiencing any problems with the UFS filesystem, say
-          Y here.  This will result in _many_ additional debugging messages to be
-          written to the system log.
-endmenu
 menuconfig NETWORK_FILESYSTEMS
        bool "Network File Systems"
@@ -1140,173 +241,8 @@ menuconfig NETWORK_FILESYSTEMS
 if NETWORK_FILESYSTEMS
-config NFS_FS
+source "fs/nfs/Kconfig"
-        tristate "NFS client support"
+source "fs/nfsd/Kconfig"
-        depends on INET
-        select LOCKD
-        select SUNRPC
-        select NFS_ACL_SUPPORT if NFS_V3_ACL
-        help
-          Choose Y here if you want to access files residing on other
-          computers using Sun's Network File System protocol.  To compile
-          this file system support as a module, choose M here: the module
-          will be called nfs.
-          To mount file systems exported by NFS servers, you also need to
-          install the user space mount.nfs command which can be found in
-          the Linux nfs-utils package, available from http://linux-nfs.org/.
-          Information about using the mount command is available in the
-          mount(8) man page.  More detail about the Linux NFS client
-          implementation is available via the nfs(5) man page.
-          Below you can choose which versions of the NFS protocol are
-          available in the kernel to mount NFS servers.  Support for NFS
-          version 2 (RFC 1094) is always available when NFS_FS is selected.
-          To configure a system which mounts its root file system via NFS
-          at boot time, say Y here, select "Kernel level IP
-          autoconfiguration" in the NETWORK menu, and select "Root file
-          system on NFS" below.  You cannot compile this file system as a
-          module in this case.
-          If unsure, say N.
-config NFS_V3
-        bool "NFS client support for NFS version 3"
-        depends on NFS_FS
-        help
-          This option enables support for version 3 of the NFS protocol
-          (RFC 1813) in the kernel's NFS client.
-          If unsure, say Y.
-config NFS_V3_ACL
-        bool "NFS client support for the NFSv3 ACL protocol extension"
-        depends on NFS_V3
-        help
-          Some NFS servers support an auxiliary NFSv3 ACL protocol that
-          Sun added to Solaris but never became an official part of the
-          NFS version 3 protocol.  This protocol extension allows
-          applications on NFS clients to manipulate POSIX Access Control
-          Lists on files residing on NFS servers.  NFS servers enforce
-          ACLs on local files whether this protocol is available or not.
-          Choose Y here if your NFS server supports the Solaris NFSv3 ACL
-          protocol extension and you want your NFS client to allow
-          applications to access and modify ACLs on files on the server.
-          Most NFS servers don't support the Solaris NFSv3 ACL protocol
-          extension.  You can choose N here or specify the "noacl" mount
-          option to prevent your NFS client from trying to use the NFSv3
-          ACL protocol.
-          If unsure, say N.
-config NFS_V4
-        bool "NFS client support for NFS version 4 (EXPERIMENTAL)"
-        depends on NFS_FS && EXPERIMENTAL
-        select RPCSEC_GSS_KRB5
-        help
-          This option enables support for version 4 of the NFS protocol
-          (RFC 3530) in the kernel's NFS client.
-          To mount NFS servers using NFSv4, you also need to install user
-          space programs which can be found in the Linux nfs-utils package,
-          available from http://linux-nfs.org/.
-          If unsure, say N.
-config ROOT_NFS
-        bool "Root file system on NFS"
-        depends on NFS_FS=y && IP_PNP
-        help
-          If you want your system to mount its root file system via NFS,
-          choose Y here.  This is common practice for managing systems
-          without local permanent storage.  For details, read
-          <file:Documentation/filesystems/nfsroot.txt>.
-          Most people say N here.
-config NFSD
-        tristate "NFS server support"
-        depends on INET
-        select LOCKD
-        select SUNRPC
-        select EXPORTFS
-        select NFS_ACL_SUPPORT if NFSD_V2_ACL
-        help
-          Choose Y here if you want to allow other computers to access
-          files residing on this system using Sun's Network File System
-          protocol.  To compile the NFS server support as a module,
-          choose M here: the module will be called nfsd.
-          You may choose to use a user-space NFS server instead, in which
-          case you can choose N here.
-          To export local file systems using NFS, you also need to install
-          user space programs which can be found in the Linux nfs-utils
-          package, available from http://linux-nfs.org/.  More detail about
-          the Linux NFS server implementation is available via the
-          exports(5) man page.
-          Below you can choose which versions of the NFS protocol are
-          available to clients mounting the NFS server on this system.
-          Support for NFS version 2 (RFC 1094) is always available when
-          CONFIG_NFSD is selected.
-          If unsure, say N.
-config NFSD_V2_ACL
-        bool
-        depends on NFSD
-config NFSD_V3
-        bool "NFS server support for NFS version 3"
-        depends on NFSD
-        help
-          This option enables support in your system's NFS server for
-          version 3 of the NFS protocol (RFC 1813).
-          If unsure, say Y.
-config NFSD_V3_ACL
-        bool "NFS server support for the NFSv3 ACL protocol extension"
-        depends on NFSD_V3
-        select NFSD_V2_ACL
-        help
-          Solaris NFS servers support an auxiliary NFSv3 ACL protocol that
-          never became an official part of the NFS version 3 protocol.
-          This protocol extension allows applications on NFS clients to
-          manipulate POSIX Access Control Lists on files residing on NFS
-          servers.  NFS servers enforce POSIX ACLs on local files whether
-          this protocol is available or not.
-          This option enables support in your system's NFS server for the
-          NFSv3 ACL protocol extension allowing NFS clients to manipulate
-          POSIX ACLs on files exported by your system's NFS server.  NFS
-          clients which support the Solaris NFSv3 ACL protocol can then
-          access and modify ACLs on your NFS server.
-          To store ACLs on your NFS server, you also need to enable ACL-
-          related CONFIG options for your local file systems of choice.
-          If unsure, say N.
-config NFSD_V4
-        bool "NFS server support for NFS version 4 (EXPERIMENTAL)"
-        depends on NFSD && PROC_FS && EXPERIMENTAL
-        select NFSD_V3
-        select FS_POSIX_ACL
-        select RPCSEC_GSS_KRB5
-        help
-          This option enables support in your system's NFS server for
-          version 4 of the NFS protocol (RFC 3530).
-          To export files using NFSv4, you need to install additional user
-          space programs which can be found in the Linux nfs-utils package,
-          available from http://linux-nfs.org/.
-          If unsure, say N.
 config LOCKD
        tristate
@@ -1328,221 +264,13 @@ config NFS_COMMON
        depends on NFSD || NFS_FS
        default y
-config SUNRPC
+source "net/sunrpc/Kconfig"
-        tristate
+source "fs/smbfs/Kconfig"
-config SUNRPC_GSS
-        tristate
-config SUNRPC_XPRT_RDMA
-        tristate
-        depends on SUNRPC && INFINIBAND && EXPERIMENTAL
-        default SUNRPC && INFINIBAND
-        help
-          This option enables an RPC client transport capability that
-          allows the NFS client to mount servers via an RDMA-enabled
-          transport.
-          To compile RPC client RDMA transport support as a module,
-          choose M here: the module will be called xprtrdma.
-          If unsure, say N.
-config SUNRPC_REGISTER_V4
-        bool "Register local RPC services via rpcbind v4 (EXPERIMENTAL)"
-        depends on SUNRPC && EXPERIMENTAL
-        default n
-        help
-          Sun added support for registering RPC services at an IPv6
-          address by creating two new versions of the rpcbind protocol
-          (RFC 1833).
-          This option enables support in the kernel RPC server for
-          registering kernel RPC services via version 4 of the rpcbind
-          protocol.  If you enable this option, you must run a portmapper
-          daemon that supports rpcbind protocol version 4.
-          Serving NFS over IPv6 from knfsd (the kernel's NFS server)
-          requires that you enable this option and use a portmapper that
-          supports rpcbind version 4.
-          If unsure, say N to get traditional behavior (register kernel
-          RPC services using only rpcbind version 2).  Distributions
-          using the legacy Linux portmapper daemon must say N here.
-config RPCSEC_GSS_KRB5
-        tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)"
-        depends on SUNRPC && EXPERIMENTAL
-        select SUNRPC_GSS
-        select CRYPTO
-        select CRYPTO_MD5
-        select CRYPTO_DES
-        select CRYPTO_CBC
-        help
-          Choose Y here to enable Secure RPC using the Kerberos version 5
-          GSS-API mechanism (RFC 1964).
-          Secure RPC calls with Kerberos require an auxiliary user-space
-          daemon which may be found in the Linux nfs-utils package
-          available from http://linux-nfs.org/.  In addition, user-space
-          Kerberos support should be installed.
-          If unsure, say N.
-config RPCSEC_GSS_SPKM3
-        tristate "Secure RPC: SPKM3 mechanism (EXPERIMENTAL)"
-        depends on SUNRPC && EXPERIMENTAL
-        select SUNRPC_GSS
-        select CRYPTO
-        select CRYPTO_MD5
-        select CRYPTO_DES
-        select CRYPTO_CAST5
-        select CRYPTO_CBC
-        help
-          Choose Y here to enable Secure RPC using the SPKM3 public key
-          GSS-API mechansim (RFC 2025).
-          Secure RPC calls with SPKM3 require an auxiliary userspace
-          daemon which may be found in the Linux nfs-utils package
-          available from http://linux-nfs.org/.
-          If unsure, say N.
-config SMB_FS
-        tristate "SMB file system support (OBSOLETE, please use CIFS)"
-        depends on INET
-        select NLS
-        help
-          SMB (Server Message Block) is the protocol Windows for Workgroups
-          (WfW), Windows 95/98, Windows NT and OS/2 Lan Manager use to share
-          files and printers over local networks.  Saying Y here allows you to
-          mount their file systems (often called "shares" in this context) and
-          access them just like any other Unix directory.  Currently, this
-          works only if the Windows machines use TCP/IP as the underlying
-          transport protocol, and not NetBEUI.  For details, read
-          <file:Documentation/filesystems/smbfs.txt> and the SMB-HOWTO,
-          available from <http://www.tldp.org/docs.html#howto>.
-          Note: if you just want your box to act as an SMB *server* and make
-          files and printing services available to Windows clients (which need
-          to have a TCP/IP stack), you don't need to say Y here; you can use
-          the program SAMBA (available from <ftp://ftp.samba.org/pub/samba/>)
-          for that.
-          General information about how to connect Linux, Windows machines and
-          Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>.
-          To compile the SMB support as a module, choose M here:
-          the module will be called smbfs.  Most people say N, however.
-config SMB_NLS_DEFAULT
-        bool "Use a default NLS"
-        depends on SMB_FS
-        help
-          Enabling this will make smbfs use nls translations by default. You
-          need to specify the local charset (CONFIG_NLS_DEFAULT) in the nls
-          settings and you need to give the default nls for the SMB server as
-          CONFIG_SMB_NLS_REMOTE.
-          The nls settings can be changed at mount time, if your smbmount
-          supports that, using the codepage and iocharset parameters.
-          smbmount from samba 2.2.0 or later supports this.
-config SMB_NLS_REMOTE
-        string "Default Remote NLS Option"
-        depends on SMB_NLS_DEFAULT
-        default "cp437"
-        help
-          This setting allows you to specify a default value for which
-          codepage the server uses. If this field is left blank no
-          translations will be done by default. The local codepage/charset
-          default to CONFIG_NLS_DEFAULT.
-          The nls settings can be changed at mount time, if your smbmount
-          supports that, using the codepage and iocharset parameters.
-          smbmount from samba 2.2.0 or later supports this.
 source "fs/cifs/Kconfig"
-config NCP_FS
-        tristate "NCP file system support (to mount NetWare volumes)"
-        depends on IPX!=n || INET
-        help
-          NCP (NetWare Core Protocol) is a protocol that runs over IPX and is
-          used by Novell NetWare clients to talk to file servers.  It is to
-          IPX what NFS is to TCP/IP, if that helps.  Saying Y here allows you
-          to mount NetWare file server volumes and to access them just like
-          any other Unix directory.  For details, please read the file
-          <file:Documentation/filesystems/ncpfs.txt> in the kernel source and
-          the IPX-HOWTO from <http://www.tldp.org/docs.html#howto>.
-          You do not have to say Y here if you want your Linux box to act as a
-          file *server* for Novell NetWare clients.
-          General information about how to connect Linux, Windows machines and
-          Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>.
-          To compile this as a module, choose M here: the module will be called
-          ncpfs.  Say N unless you are connected to a Novell network.
 source "fs/ncpfs/Kconfig"
+source "fs/coda/Kconfig"
-config CODA_FS
+source "fs/afs/Kconfig"
-        tristate "Coda file system support (advanced network fs)"
+source "fs/9p/Kconfig"
-        depends on INET
-        help
-          Coda is an advanced network file system, similar to NFS in that it
-          enables you to mount file systems of a remote server and access them
-          with regular Unix commands as if they were sitting on your hard
-          disk.  Coda has several advantages over NFS: support for
-          disconnected operation (e.g. for laptops), read/write server
-          replication, security model for authentication and encryption,
-          persistent client caches and write back caching.
-          If you say Y here, your Linux box will be able to act as a Coda
-          *client*.  You will need user level code as well, both for the
-          client and server.  Servers are currently user level, i.e. they need
-          no kernel support.  Please read
-          <file:Documentation/filesystems/coda.txt> and check out the Coda
-          home page <http://www.coda.cs.cmu.edu/>.
-          To compile the coda client support as a module, choose M here: the
-          module will be called coda.
-config AFS_FS
-        tristate "Andrew File System support (AFS) (EXPERIMENTAL)"
-        depends on INET && EXPERIMENTAL
-        select AF_RXRPC
-        help
-          If you say Y here, you will get an experimental Andrew File System
-          driver. It currently only supports unsecured read-only AFS access.
-          See <file:Documentation/filesystems/afs.txt> for more information.
-          If unsure, say N.
-config AFS_DEBUG
-        bool "AFS dynamic debugging"
-        depends on AFS_FS
-        help
-          Say Y here to make runtime controllable debugging messages appear.
-          See <file:Documentation/filesystems/afs.txt> for more information.
-          If unsure, say N.
-config 9P_FS
-        tristate "Plan 9 Resource Sharing Support (9P2000) (Experimental)"
-        depends on INET && NET_9P && EXPERIMENTAL
-        help
-          If you say Y here, you will get experimental support for
-          Plan 9 resource sharing via the 9P2000 protocol.
-          See <http://v9fs.sf.net> for more information.
-          If unsure, say N.
 endif # NETWORK_FILESYSTEMS
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index ce9fb3fbfae4..bb4cc5b8abc8 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -43,7 +43,7 @@ config BINFMT_ELF_FDPIC
 config CORE_DUMP_DEFAULT_ELF_HEADERS
        bool "Write ELF core dumps with partial segments"
        default n
-        depends on BINFMT_ELF
+        depends on BINFMT_ELF && ELF_CORE
        help
          ELF core dump files describe each memory mapping of the crashed
          process, and can contain or omit the memory contents of each one.
diff --git a/fs/Makefile b/fs/Makefile
index d9f8afe6f0c4..38bc735c67ad 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -20,8 +20,7 @@ obj-y +=	no-block.o
 endif
 obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o
-obj-$(CONFIG_INOTIFY)           += inotify.o
+obj-y                           += notify/
-obj-$(CONFIG_INOTIFY_USER)      += inotify_user.o
 obj-$(CONFIG_EPOLL)             += eventpoll.o
 obj-$(CONFIG_ANON_INODES)       += anon_inodes.o
 obj-$(CONFIG_SIGNALFD)          += signalfd.o
@@ -55,10 +54,9 @@ obj-$(CONFIG_GENERIC_ACL)	+= generic_acl.o
 obj-$(CONFIG_QUOTA)             += dquot.o
 obj-$(CONFIG_QFMT_V1)           += quota_v1.o
 obj-$(CONFIG_QFMT_V2)           += quota_v2.o
+obj-$(CONFIG_QUOTA_TREE)        += quota_tree.o
 obj-$(CONFIG_QUOTACTL)          += quota.o
-obj-$(CONFIG_DNOTIFY)           += dnotify.o
 obj-$(CONFIG_PROC_FS)           += proc/
 obj-y                           += partitions/
 obj-$(CONFIG_SYSFS)             += sysfs/
@@ -76,6 +74,7 @@ obj-$(CONFIG_JBD)		+= jbd/
 obj-$(CONFIG_JBD2)              += jbd2/
 obj-$(CONFIG_EXT2_FS)           += ext2/
 obj-$(CONFIG_CRAMFS)            += cramfs/
+obj-$(CONFIG_SQUASHFS)          += squashfs/
 obj-y                           += ramfs/
 obj-$(CONFIG_HUGETLBFS)         += hugetlbfs/
 obj-$(CONFIG_CODA_FS)           += coda/
@@ -121,4 +120,5 @@ obj-$(CONFIG_HOSTFS)		+= hostfs/
 obj-$(CONFIG_HPPFS)             += hppfs/
 obj-$(CONFIG_DEBUG_FS)          += debugfs/
 obj-$(CONFIG_OCFS2_FS)          += ocfs2/
+obj-$(CONFIG_BTRFS_FS)          += btrfs/
 obj-$(CONFIG_GFS2_FS)           += gfs2/
diff --git a/fs/adfs/Kconfig b/fs/adfs/Kconfig
new file mode 100644
index 000000000000..e55182a74605
--- /dev/null
+++ b/fs/adfs/Kconfig
@@ -0,0 +1,27 @@
+config ADFS_FS
+        tristate "ADFS file system support (EXPERIMENTAL)"
+        depends on BLOCK && EXPERIMENTAL
+        help
+          The Acorn Disc Filing System is the standard file system of the
+          RiscOS operating system which runs on Acorn's ARM-based Risc PC
+          systems and the Acorn Archimedes range of machines. If you say Y
+          here, Linux will be able to read from ADFS partitions on hard drives
+          and from ADFS-formatted floppy discs. If you also want to be able to
+          write to those devices, say Y to "ADFS write support" below.
+          The ADFS partition should be the first partition (i.e.,
+          /dev/[hs]d?1) on each of your drives. Please read the file
+          <file:Documentation/filesystems/adfs.txt> for further details.
+          To compile this code as a module, choose M here: the module will be
+          called adfs.
+          If unsure, say N.
+config ADFS_FS_RW
+        bool "ADFS write support (DANGEROUS)"
+        depends on ADFS_FS
+        help
+          If you say Y here, you will be able to write to ADFS partitions on
+          hard drives and ADFS-formatted floppy disks. This is experimental
+          code, so if you're unsure, say N.
diff --git a/fs/affs/Kconfig b/fs/affs/Kconfig
new file mode 100644
index 000000000000..cfad9afb4762
--- /dev/null
+++ b/fs/affs/Kconfig
@@ -0,0 +1,21 @@
+config AFFS_FS
+        tristate "Amiga FFS file system support (EXPERIMENTAL)"
+        depends on BLOCK && EXPERIMENTAL
+        help
+          The Fast File System (FFS) is the common file system used on hard
+          disks by Amiga(tm) systems since AmigaOS Version 1.3 (34.20).  Say Y
+          if you want to be able to read and write files from and to an Amiga
+          FFS partition on your hard drive.  Amiga floppies however cannot be
+          read with this driver due to an incompatibility of the floppy
+          controller used in an Amiga and the standard floppy controller in
+          PCs and workstations. Read <file:Documentation/filesystems/affs.txt>
+          and <file:fs/affs/Changes>.
+          With this driver you can also mount disk files used by Bernd
+          Schmidt's Un*X Amiga Emulator
+          (<http://www.freiburg.linux.de/~uae/>).
+          If you want to do this, you will also need to say Y or M to "Loop
+          device support", above.
+          To compile this file system support as a module, choose M here: the
+          module will be called affs.  If unsure, say N.
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 1377b1240b6e..9246cb4aa018 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -628,7 +628,7 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping
        }
        index = pos >> PAGE_CACHE_SHIFT;
-        page = __grab_cache_page(mapping, index);
+        page = grab_cache_page_write_begin(mapping, index, flags);
        if (!page)
                return -ENOMEM;
        *pagep = page;
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 415d9c67ac16..3c4ec7d864c4 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -119,8 +119,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
                goto bad_inode;
 #else
                inode->i_mode |= S_IFDIR;
-                inode->i_op = NULL;
+                /* ... and leave ->i_op and ->i_fop pointing to empty */
-                inode->i_fop = NULL;
                break;
 #endif
        case ST_LINKFILE:
diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig
new file mode 100644
index 000000000000..e7b522fe15e1
--- /dev/null
+++ b/fs/afs/Kconfig
@@ -0,0 +1,21 @@
+config AFS_FS
+        tristate "Andrew File System support (AFS) (EXPERIMENTAL)"
+        depends on INET && EXPERIMENTAL
+        select AF_RXRPC
+        help
+          If you say Y here, you will get an experimental Andrew File System
+          driver. It currently only supports unsecured read-only AFS access.
+          See <file:Documentation/filesystems/afs.txt> for more information.
+          If unsure, say N.
+config AFS_DEBUG
+        bool "AFS dynamic debugging"
+        depends on AFS_FS
+        help
+          Say Y here to make runtime controllable debugging messages appear.
+          See <file:Documentation/filesystems/afs.txt> for more information.
+          If unsure, say N.
diff --git a/fs/afs/write.c b/fs/afs/write.c
index d6b85dab35fc..3fb36d433621 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -144,7 +144,7 @@ int afs_write_begin(struct file *file, struct address_space *mapping,
        candidate->state = AFS_WBACK_PENDING;
        init_waitqueue_head(&candidate->waitq);
-        page = __grab_cache_page(mapping, index);
+        page = grab_cache_page_write_begin(mapping, index, flags);
        if (!page) {
                kfree(candidate);
                return -ENOMEM;
diff --git a/fs/aio.c b/fs/aio.c
index d6f89d3c15e8..8fa77e233944 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1270,7 +1270,7 @@ static void io_destroy(struct kioctx *ioctx)
 *      pointer is passed for ctxp.  Will fail with -ENOSYS if not
 *      implemented.
 */
-asmlinkage long sys_io_setup(unsigned nr_events, aio_context_t __user *ctxp)
+SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
 {
        struct kioctx *ioctx = NULL;
        unsigned long ctx;
@@ -1308,7 +1308,7 @@ out:
 *      implemented.  May fail with -EFAULT if the context pointed to
 *      is invalid.
 */
-asmlinkage long sys_io_destroy(aio_context_t ctx)
+SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
 {
        struct kioctx *ioctx = lookup_ioctx(ctx);
        if (likely(NULL != ioctx)) {
@@ -1662,8 +1662,8 @@ out_put_req:
 *      are available to queue any iocbs.  Will return 0 if nr is 0.  Will
 *      fail with -ENOSYS if not implemented.
 */
-asmlinkage long sys_io_submit(aio_context_t ctx_id, long nr,
+SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
-                              struct iocb __user * __user *iocbpp)
+                struct iocb __user * __user *, iocbpp)
 {
        struct kioctx *ctx;
        long ret = 0;
@@ -1737,8 +1737,8 @@ static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb,
 *      invalid.  May fail with -EAGAIN if the iocb specified was not
 *      cancelled.  Will fail with -ENOSYS if not implemented.
 */
-asmlinkage long sys_io_cancel(aio_context_t ctx_id, struct iocb __user *iocb,
+SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
-                              struct io_event __user *result)
+                struct io_event __user *, result)
 {
        int (*cancel)(struct kiocb *iocb, struct io_event *res);
        struct kioctx *ctx;
@@ -1799,11 +1799,11 @@ asmlinkage long sys_io_cancel(aio_context_t ctx_id, struct iocb __user *iocb,
 *      will be updated if not NULL and the operation blocks.  Will fail
 *      with -ENOSYS if not implemented.
 */
-asmlinkage long sys_io_getevents(aio_context_t ctx_id,
+SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
-                                 long min_nr,
+                long, min_nr,
-                                 long nr,
+                long, nr,
-                                 struct io_event __user *events,
+                struct io_event __user *, events,
-                                 struct timespec __user *timeout)
+                struct timespec __user *, timeout)
 {
        struct kioctx *ioctx = lookup_ioctx(ctx_id);
        long ret = -EINVAL;
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index c16d9be1b017..3bbdb9d02376 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -79,9 +79,12 @@ int anon_inode_getfd(const char *name, const struct file_operations *fops,
        if (IS_ERR(anon_inode_inode))
                return -ENODEV;
+        if (fops->owner && !try_module_get(fops->owner))
+                return -ENOENT;
        error = get_unused_fd_flags(flags);
        if (error < 0)
-                return error;
+                goto err_module;
        fd = error;
        /*
@@ -128,6 +131,8 @@ err_dput:
        dput(dentry);
 err_put_unused_fd:
        put_unused_fd(fd);
+err_module:
+        module_put(fops->owner);
        return error;
 }
 EXPORT_SYMBOL_GPL(anon_inode_getfd);
diff --git a/fs/autofs/Kconfig b/fs/autofs/Kconfig
new file mode 100644
index 000000000000..5f3bea90911e
--- /dev/null
+++ b/fs/autofs/Kconfig
@@ -0,0 +1,21 @@
+config AUTOFS_FS
+        tristate "Kernel automounter support"
+        help
+          The automounter is a tool to automatically mount remote file systems
+          on demand. This implementation is partially kernel-based to reduce
+          overhead in the already-mounted case; this is unlike the BSD
+          automounter (amd), which is a pure user space daemon.
+          To use the automounter you need the user-space tools from the autofs
+          package; you can find the location in <file:Documentation/Changes>.
+          You also want to answer Y to "NFS file system support", below.
+          If you want to use the newer version of the automounter with more
+          features, say N here and say Y to "Kernel automounter v4 support",
+          below.
+          To compile this support as a module, choose M here: the module will be
+          called autofs.
+          If you are not a part of a fairly large, distributed network, you
+          probably do not need an automounter, and can say N here.
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index c773680d5c60..e1734f2d6e26 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -251,13 +251,11 @@ struct inode *autofs_iget(struct super_block *sb, unsigned long ino)
        inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
        inode->i_nlink = 2;
        inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
-        inode->i_blocks = 0;
        if (ino == AUTOFS_ROOT_INO) {
                inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
                inode->i_op = &autofs_root_inode_operations;
                inode->i_fop = &autofs_root_operations;
-                inode->i_uid = inode->i_gid = 0; /* Changed in read_super */
                goto done;
        } 
        
diff --git a/fs/autofs4/Kconfig b/fs/autofs4/Kconfig
new file mode 100644
index 000000000000..1204d6384d39
--- /dev/null
+++ b/fs/autofs4/Kconfig
@@ -0,0 +1,20 @@
+config AUTOFS4_FS
+        tristate "Kernel automounter version 4 support (also supports v3)"
+        help
+          The automounter is a tool to automatically mount remote file systems
+          on demand. This implementation is partially kernel-based to reduce
+          overhead in the already-mounted case; this is unlike the BSD
+          automounter (amd), which is a pure user space daemon.
+          To use the automounter you need the user-space tools from
+          <ftp://ftp.kernel.org/pub/linux/daemons/autofs/v4/>; you also
+          want to answer Y to "NFS file system support", below.
+          To compile this support as a module, choose M here: the module will be
+          called autofs4.  You will need to add "alias autofs autofs4" to your
+          modules configuration file.
+          If you are not a part of a fairly large, distributed network or
+          don't have a laptop which needs to dynamically reconfigure to the
+          local network, you probably do not need an automounter, and can say
+          N here.
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index e0f16da00e54..a76803108d06 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -25,8 +25,6 @@
 #define AUTOFS_DEV_IOCTL_IOC_FIRST      (AUTOFS_DEV_IOCTL_VERSION)
 #define AUTOFS_DEV_IOCTL_IOC_COUNT      (AUTOFS_IOC_COUNT - 11)
-#define AUTOFS_TYPE_TRIGGER     (AUTOFS_TYPE_DIRECT|AUTOFS_TYPE_OFFSET)
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/time.h>
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 63b7c7afe8df..025e105bffea 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -124,7 +124,7 @@ static inline void free_dev_ioctl(struct autofs_dev_ioctl *param)
 /*
 * Check sanity of parameter control fields and if a path is present
- * check that it has a "/" and is terminated.
+ * check that it is terminated and contains at least one "/".
 */
 static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
 {
@@ -138,15 +138,16 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
        }
        if (param->size > sizeof(*param)) {
-                err = check_name(param->path);
+                err = invalid_str(param->path,
+                                 (void *) ((size_t) param + param->size));
                if (err) {
-                        AUTOFS_WARN("invalid path supplied for cmd(0x%08x)",
+                        AUTOFS_WARN(
-                                    cmd);
+                          "path string terminator missing for cmd(0x%08x)",
+                          cmd);
                        goto out;
                }
-                err = invalid_str(param->path,
+                err = check_name(param->path);
-                                 (void *) ((size_t) param + param->size));
                if (err) {
                        AUTOFS_WARN("invalid path supplied for cmd(0x%08x)",
                                    cmd);
@@ -180,7 +181,7 @@ static int autofs_dev_ioctl_protover(struct file *fp,
                                     struct autofs_sb_info *sbi,
                                     struct autofs_dev_ioctl *param)
 {
-        param->arg1 = sbi->version;
+        param->protover.version = sbi->version;
        return 0;
 }
@@ -189,7 +190,7 @@ static int autofs_dev_ioctl_protosubver(struct file *fp,
                                        struct autofs_sb_info *sbi,
                                        struct autofs_dev_ioctl *param)
 {
-        param->arg1 = sbi->sub_version;
+        param->protosubver.sub_version = sbi->sub_version;
        return 0;
 }
@@ -335,13 +336,13 @@ static int autofs_dev_ioctl_openmount(struct file *fp,
        int err, fd;
        /* param->path has already been checked */
-        if (!param->arg1)
+        if (!param->openmount.devid)
                return -EINVAL;
        param->ioctlfd = -1;
        path = param->path;
-        devid = param->arg1;
+        devid = param->openmount.devid;
        err = 0;
        fd = autofs_dev_ioctl_open_mountpoint(path, devid);
@@ -373,7 +374,7 @@ static int autofs_dev_ioctl_ready(struct file *fp,
 {
        autofs_wqt_t token;
-        token = (autofs_wqt_t) param->arg1;
+        token = (autofs_wqt_t) param->ready.token;
        return autofs4_wait_release(sbi, token, 0);
 }
@@ -388,8 +389,8 @@ static int autofs_dev_ioctl_fail(struct file *fp,
        autofs_wqt_t token;
        int status;
-        token = (autofs_wqt_t) param->arg1;
+        token = (autofs_wqt_t) param->fail.token;
-        status = param->arg2 ? param->arg2 : -ENOENT;
+        status = param->fail.status ? param->fail.status : -ENOENT;
        return autofs4_wait_release(sbi, token, status);
 }
@@ -412,10 +413,10 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
        int pipefd;
        int err = 0;
-        if (param->arg1 == -1)
+        if (param->setpipefd.pipefd == -1)
                return -EINVAL;
-        pipefd = param->arg1;
+        pipefd = param->setpipefd.pipefd;
        mutex_lock(&sbi->wq_mutex);
        if (!sbi->catatonic) {
@@ -457,8 +458,8 @@ static int autofs_dev_ioctl_timeout(struct file *fp,
 {
        unsigned long timeout;
-        timeout = param->arg1;
+        timeout = param->timeout.timeout;
-        param->arg1 = sbi->exp_timeout / HZ;
+        param->timeout.timeout = sbi->exp_timeout / HZ;
        sbi->exp_timeout = timeout * HZ;
        return 0;
 }
@@ -489,7 +490,7 @@ static int autofs_dev_ioctl_requester(struct file *fp,
        path = param->path;
        devid = sbi->sb->s_dev;
-        param->arg1 = param->arg2 = -1;
+        param->requester.uid = param->requester.gid = -1;
        /* Get nameidata of the parent directory */
        err = path_lookup(path, LOOKUP_PARENT, &nd);
@@ -505,8 +506,8 @@ static int autofs_dev_ioctl_requester(struct file *fp,
                err = 0;
                autofs4_expire_wait(nd.path.dentry);
                spin_lock(&sbi->fs_lock);
-                param->arg1 = ino->uid;
+                param->requester.uid = ino->uid;
-                param->arg2 = ino->gid;
+                param->requester.gid = ino->gid;
                spin_unlock(&sbi->fs_lock);
        }
@@ -529,10 +530,10 @@ static int autofs_dev_ioctl_expire(struct file *fp,
        int err = -EAGAIN;
        int how;
-        how = param->arg1;
+        how = param->expire.how;
        mnt = fp->f_path.mnt;
-        if (sbi->type & AUTOFS_TYPE_TRIGGER)
+        if (autofs_type_trigger(sbi->type))
                dentry = autofs4_expire_direct(sbi->sb, mnt, sbi, how);
        else
                dentry = autofs4_expire_indirect(sbi->sb, mnt, sbi, how);
@@ -565,9 +566,9 @@ static int autofs_dev_ioctl_askumount(struct file *fp,
                                      struct autofs_sb_info *sbi,
                                      struct autofs_dev_ioctl *param)
 {
-        param->arg1 = 0;
+        param->askumount.may_umount = 0;
        if (may_umount(fp->f_path.mnt))
-                param->arg1 = 1;
+                param->askumount.may_umount = 1;
        return 0;
 }
@@ -600,6 +601,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
        struct nameidata nd;
        const char *path;
        unsigned int type;
+        unsigned int devid, magic;
        int err = -ENOENT;
        if (param->size <= sizeof(*param)) {
@@ -608,13 +610,13 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
        }
        path = param->path;
-        type = param->arg1;
+        type = param->ismountpoint.in.type;
-        param->arg1 = 0;
+        param->ismountpoint.out.devid = devid = 0;
-        param->arg2 = 0;
+        param->ismountpoint.out.magic = magic = 0;
        if (!fp || param->ioctlfd == -1) {
-                if (type == AUTOFS_TYPE_ANY) {
+                if (autofs_type_any(type)) {
                        struct super_block *sb;
                        err = path_lookup(path, LOOKUP_FOLLOW, &nd);
@@ -622,7 +624,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
                                goto out;
                        sb = nd.path.dentry->d_sb;
-                        param->arg1 = new_encode_dev(sb->s_dev);
+                        devid = new_encode_dev(sb->s_dev);
                } else {
                        struct autofs_info *ino;
@@ -635,38 +637,41 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
                                goto out_release;
                        ino = autofs4_dentry_ino(nd.path.dentry);
-                        param->arg1 = autofs4_get_dev(ino->sbi);
+                        devid = autofs4_get_dev(ino->sbi);
                }
                err = 0;
                if (nd.path.dentry->d_inode &&
                    nd.path.mnt->mnt_root == nd.path.dentry) {
                        err = 1;
-                        param->arg2 = nd.path.dentry->d_inode->i_sb->s_magic;
+                        magic = nd.path.dentry->d_inode->i_sb->s_magic;
                }
        } else {
-                dev_t devid = new_encode_dev(sbi->sb->s_dev);
+                dev_t dev = autofs4_get_dev(sbi);
                err = path_lookup(path, LOOKUP_PARENT, &nd);
                if (err)
                        goto out;
-                err = autofs_dev_ioctl_find_super(&nd, devid);
+                err = autofs_dev_ioctl_find_super(&nd, dev);
                if (err)
                        goto out_release;
-                param->arg1 = autofs4_get_dev(sbi);
+                devid = dev;
                err = have_submounts(nd.path.dentry);
                if (nd.path.mnt->mnt_mountpoint != nd.path.mnt->mnt_root) {
                        if (follow_down(&nd.path.mnt, &nd.path.dentry)) {
                                struct inode *inode = nd.path.dentry->d_inode;
-                                param->arg2 = inode->i_sb->s_magic;
+                                magic = inode->i_sb->s_magic;
                        }
                }
        }
+        param->ismountpoint.out.devid = devid;
+        param->ismountpoint.out.magic = magic;
 out_release:
        path_put(&nd.path);
 out:
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 4b6fb3f628c0..e3bd50776f9e 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -63,7 +63,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
                struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
                /* This is an autofs submount, we can't expire it */
-                if (sbi->type == AUTOFS_TYPE_INDIRECT)
+                if (autofs_type_indirect(sbi->type))
                        goto done;
                /*
@@ -490,7 +490,7 @@ int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
        if (arg && get_user(do_now, arg))
                return -EFAULT;
-        if (sbi->type & AUTOFS_TYPE_TRIGGER)
+        if (autofs_type_trigger(sbi->type))
                dentry = autofs4_expire_direct(sb, mnt, sbi, do_now);
        else
                dentry = autofs4_expire_indirect(sb, mnt, sbi, do_now);
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 7b19802cfef4..716e12b627b2 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -197,9 +197,9 @@ static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt)
        seq_printf(m, ",minproto=%d", sbi->min_proto);
        seq_printf(m, ",maxproto=%d", sbi->max_proto);
-        if (sbi->type & AUTOFS_TYPE_OFFSET)
+        if (autofs_type_offset(sbi->type))
                seq_printf(m, ",offset");
-        else if (sbi->type & AUTOFS_TYPE_DIRECT)
+        else if (autofs_type_direct(sbi->type))
                seq_printf(m, ",direct");
        else
                seq_printf(m, ",indirect");
@@ -284,13 +284,13 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid,
                        *maxproto = option;
                        break;
                case Opt_indirect:
-                        *type = AUTOFS_TYPE_INDIRECT;
+                        set_autofs_type_indirect(type);
                        break;
                case Opt_direct:
-                        *type = AUTOFS_TYPE_DIRECT;
+                        set_autofs_type_direct(type);
                        break;
                case Opt_offset:
-                        *type = AUTOFS_TYPE_OFFSET;
+                        set_autofs_type_offset(type);
                        break;
                default:
                        return 1;
@@ -338,7 +338,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
        sbi->sb = s;
        sbi->version = 0;
        sbi->sub_version = 0;
-        sbi->type = AUTOFS_TYPE_INDIRECT;
+        set_autofs_type_indirect(&sbi->type);
        sbi->min_proto = 0;
        sbi->max_proto = 0;
        mutex_init(&sbi->wq_mutex);
@@ -380,7 +380,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
        }
        root_inode->i_fop = &autofs4_root_operations;
-        root_inode->i_op = sbi->type & AUTOFS_TYPE_TRIGGER ?
+        root_inode->i_op = autofs_type_trigger(sbi->type) ?
                        &autofs4_direct_root_inode_operations :
                        &autofs4_indirect_root_inode_operations;
@@ -455,11 +455,7 @@ struct inode *autofs4_get_inode(struct super_block *sb,
        if (sb->s_root) {
                inode->i_uid = sb->s_root->d_inode->i_uid;
                inode->i_gid = sb->s_root->d_inode->i_gid;
-        } else {
-                inode->i_uid = 0;
-                inode->i_gid = 0;
        }
-        inode->i_blocks = 0;
        inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
        if (S_ISDIR(inf->mode)) {
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index e02cc8ae5eb3..eeb246845909 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -337,7 +337,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
                 * is very similar for indirect mounts except only dentrys
                 * in the root of the autofs file system may be negative.
                 */
-                if (sbi->type & AUTOFS_TYPE_TRIGGER)
+                if (autofs_type_trigger(sbi->type))
                        return -ENOENT;
                else if (!IS_ROOT(dentry->d_parent))
                        return -ENOENT;
@@ -348,7 +348,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
                return -ENOMEM;
        /* If this is a direct mount request create a dummy name */
-        if (IS_ROOT(dentry) && sbi->type & AUTOFS_TYPE_TRIGGER)
+        if (IS_ROOT(dentry) && autofs_type_trigger(sbi->type))
                qstr.len = sprintf(name, "%p", dentry);
        else {
                qstr.len = autofs4_getpath(sbi, dentry, &name);
@@ -406,11 +406,11 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
                                type = autofs_ptype_expire_multi;
                } else {
                        if (notify == NFY_MOUNT)
-                                type = (sbi->type & AUTOFS_TYPE_TRIGGER) ?
+                                type = autofs_type_trigger(sbi->type) ?
                                        autofs_ptype_missing_direct :
                                         autofs_ptype_missing_indirect;
                        else
-                                type = (sbi->type & AUTOFS_TYPE_TRIGGER) ?
+                                type = autofs_type_trigger(sbi->type) ?
                                        autofs_ptype_expire_direct :
                                        autofs_ptype_expire_indirect;
                }
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 5f1538c03b1b..a05287a23f62 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -132,11 +132,6 @@ static int bad_file_check_flags(int flags)
        return -EIO;
 }
-static int bad_file_dir_notify(struct file *file, unsigned long arg)
-{
-        return -EIO;
-}
 static int bad_file_flock(struct file *filp, int cmd, struct file_lock *fl)
 {
        return -EIO;
@@ -179,7 +174,6 @@ static const struct file_operations bad_file_ops =
        .sendpage       = bad_file_sendpage,
        .get_unmapped_area = bad_file_get_unmapped_area,
        .check_flags    = bad_file_check_flags,
-        .dir_notify     = bad_file_dir_notify,
        .flock          = bad_file_flock,
        .splice_write   = bad_file_splice_write,
        .splice_read    = bad_file_splice_read,
diff --git a/fs/befs/Kconfig b/fs/befs/Kconfig
new file mode 100644
index 000000000000..7835d30f211f
--- /dev/null
+++ b/fs/befs/Kconfig
@@ -0,0 +1,26 @@
+config BEFS_FS
+        tristate "BeOS file system (BeFS) support (read only) (EXPERIMENTAL)"
+        depends on BLOCK && EXPERIMENTAL
+        select NLS
+        help
+          The BeOS File System (BeFS) is the native file system of Be, Inc's
+          BeOS. Notable features include support for arbitrary attributes
+          on files and directories, and database-like indices on selected
+          attributes. (Also note that this driver doesn't make those features
+          available at this time). It is a 64 bit filesystem, so it supports
+          extremely large volumes and files.
+          If you use this filesystem, you should also say Y to at least one
+          of the NLS (native language support) options below.
+          If you don't know what this is about, say N.
+          To compile this as a module, choose M here: the module will be
+          called befs.
+config BEFS_DEBUG
+        bool "Debug BeFS"
+        depends on BEFS_FS
+        help
+          If you say Y here, you can use the 'debug' mount option to enable
+          debugging output from the driver.
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index b6dfee37c7b7..d06cb023ad02 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -378,7 +378,8 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
                inode->i_size = 0;
                inode->i_blocks = befs_sb->block_size / VFS_BLOCK_SIZE;
                strncpy(befs_ino->i_data.symlink, raw_inode->data.symlink,
-                        BEFS_SYMLINK_LEN);
+                        BEFS_SYMLINK_LEN - 1);
+                befs_ino->i_data.symlink[BEFS_SYMLINK_LEN - 1] = '\0';
        } else {
                int num_blks;
@@ -477,6 +478,8 @@ befs_follow_link(struct dentry *dentry, struct nameidata *nd)
                        kfree(link);
                        befs_error(sb, "Failed to read entire long symlink");
                        link = ERR_PTR(-EIO);
+                } else {
+                        link[len - 1] = '\0';
                }
        } else {
                link = befs_ino->i_data.symlink;
diff --git a/fs/bfs/Kconfig b/fs/bfs/Kconfig
new file mode 100644
index 000000000000..c2336c62024f
--- /dev/null
+++ b/fs/bfs/Kconfig
@@ -0,0 +1,19 @@
+config BFS_FS
+        tristate "BFS file system support (EXPERIMENTAL)"
+        depends on BLOCK && EXPERIMENTAL
+        help
+          Boot File System (BFS) is a file system used under SCO UnixWare to
+          allow the bootloader access to the kernel image and other important
+          files during the boot process.  It is usually mounted under /stand
+          and corresponds to the slice marked as "STAND" in the UnixWare
+          partition.  You should say Y if you want to read or write the files
+          on your /stand slice from within Linux.  You then also need to say Y
+          to "UnixWare slices support", below.  More information about the BFS
+          file system is contained in the file
+          <file:Documentation/filesystems/bfs.txt>.
+          If you don't know what this is about, say N.
+          To compile this as a module, choose M here: the module will be called
+          bfs.  Note that the file system of your root partition (the one
+          containing the directory /) cannot be compiled as a module.
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 0ed57b5ee012..cc4062d12ca2 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -213,6 +213,9 @@ static void bfs_put_super(struct super_block *s)
 {
        struct bfs_sb_info *info = BFS_SB(s);
+        if (!info)
+                return;
        brelse(info->si_sbh);
        mutex_destroy(&info->bfs_lock);
        kfree(info->si_imap);
@@ -327,6 +330,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
        unsigned i, imap_len;
        struct bfs_sb_info *info;
        long ret = -EINVAL;
+        unsigned long i_sblock, i_eblock, i_eoff, s_size;
        info = kzalloc(sizeof(*info), GFP_KERNEL);
        if (!info)
@@ -350,6 +354,12 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
        s->s_magic = BFS_MAGIC;
        info->si_sbh = bh;
+        if (le32_to_cpu(bfs_sb->s_start) > le32_to_cpu(bfs_sb->s_end)) {
+                printf("Superblock is corrupted\n");
+                goto out;
+        }
        info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) /
                                        sizeof(struct bfs_inode)
                                        + BFS_ROOT_INO - 1;
@@ -380,6 +390,18 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
                        - le32_to_cpu(bfs_sb->s_start)) >> BFS_BSIZE_BITS;
        info->si_freei = 0;
        info->si_lf_eblk = 0;
+        /* can we read the last block? */
+        bh = sb_bread(s, info->si_blocks - 1);
+        if (!bh) {
+                printf("Last block not available: %lu\n", info->si_blocks - 1);
+                iput(inode);
+                ret = -EIO;
+                kfree(info->si_imap);
+                goto out;
+        }
+        brelse(bh);
        bh = NULL;
        for (i = BFS_ROOT_INO; i <= info->si_lasti; i++) {
                struct bfs_inode *di;
@@ -397,6 +419,29 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
                di = (struct bfs_inode *)bh->b_data + off;
+                /* test if filesystem is not corrupted */
+                i_eoff = le32_to_cpu(di->i_eoffset);
+                i_sblock = le32_to_cpu(di->i_sblock);
+                i_eblock = le32_to_cpu(di->i_eblock);
+                s_size = le32_to_cpu(bfs_sb->s_end);
+                if (i_sblock > info->si_blocks ||
+                        i_eblock > info->si_blocks ||
+                        i_sblock > i_eblock ||
+                        i_eoff > s_size ||
+                        i_sblock * BFS_BSIZE > i_eoff) {
+                        printf("Inode 0x%08x corrupted\n", i);
+                        brelse(bh);
+                        s->s_root = NULL;
+                        kfree(info->si_imap);
+                        kfree(info);
+                        s->s_fs_info = NULL;
+                        return -EIO;
+                }
                if (!di->i_ino) {
                        info->si_freei++;
                        continue;
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index f1f3f4192a60..b639dcf7c778 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -95,92 +95,55 @@ static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, u
        int has_dumped = 0;
        unsigned long dump_start, dump_size;
        struct user dump;
-#if defined(__alpha__)
+#ifdef __alpha__
 #       define START_DATA(u)    (u.start_data)
-#elif defined(__arm__)
+#else
 #       define START_DATA(u)    ((u.u_tsize << PAGE_SHIFT) + u.start_code)
-#elif defined(__sparc__)
-#       define START_DATA(u)    (u.u_tsize)
-#elif defined(__i386__) || defined(__mc68000__) || defined(__arch_um__)
-#       define START_DATA(u)    (u.u_tsize << PAGE_SHIFT)
 #endif
-#ifdef __sparc__
-#       define START_STACK(u)   ((regs->u_regs[UREG_FP]) & ~(PAGE_SIZE - 1))
-#else
 #       define START_STACK(u)   (u.start_stack)
-#endif
        fs = get_fs();
        set_fs(KERNEL_DS);
        has_dumped = 1;
        current->flags |= PF_DUMPCORE;
        strncpy(dump.u_comm, current->comm, sizeof(dump.u_comm));
-#ifndef __sparc__
        dump.u_ar0 = offsetof(struct user, regs);
-#endif
        dump.signal = signr;
        aout_dump_thread(regs, &dump);
 /* If the size of the dump file exceeds the rlimit, then see what would happen
   if we wrote the stack, but not the data area.  */
-#ifdef __sparc__
-        if ((dump.u_dsize + dump.u_ssize) > limit)
-                dump.u_dsize = 0;
-#else
        if ((dump.u_dsize + dump.u_ssize+1) * PAGE_SIZE > limit)
                dump.u_dsize = 0;
-#endif
 /* Make sure we have enough room to write the stack and data areas. */
-#ifdef __sparc__
-        if (dump.u_ssize > limit)
-                dump.u_ssize = 0;
-#else
        if ((dump.u_ssize + 1) * PAGE_SIZE > limit)
                dump.u_ssize = 0;
-#endif
 /* make sure we actually have a data and stack area to dump */
        set_fs(USER_DS);
-#ifdef __sparc__
-        if (!access_ok(VERIFY_READ, (void __user *)START_DATA(dump), dump.u_dsize))
-                dump.u_dsize = 0;
-        if (!access_ok(VERIFY_READ, (void __user *)START_STACK(dump), dump.u_ssize))
-                dump.u_ssize = 0;
-#else
        if (!access_ok(VERIFY_READ, (void __user *)START_DATA(dump), dump.u_dsize << PAGE_SHIFT))
                dump.u_dsize = 0;
        if (!access_ok(VERIFY_READ, (void __user *)START_STACK(dump), dump.u_ssize << PAGE_SHIFT))
                dump.u_ssize = 0;
-#endif
        set_fs(KERNEL_DS);
 /* struct user */
        DUMP_WRITE(&dump,sizeof(dump));
 /* Now dump all of the user data.  Include malloced stuff as well */
-#ifndef __sparc__
        DUMP_SEEK(PAGE_SIZE);
-#endif
 /* now we start writing out the user space info */
        set_fs(USER_DS);
 /* Dump the data area */
        if (dump.u_dsize != 0) {
                dump_start = START_DATA(dump);
-#ifdef __sparc__
-                dump_size = dump.u_dsize;
-#else
                dump_size = dump.u_dsize << PAGE_SHIFT;
-#endif
                DUMP_WRITE(dump_start,dump_size);
        }
 /* Now prepare to dump the stack area */
        if (dump.u_ssize != 0) {
                dump_start = START_STACK(dump);
-#ifdef __sparc__
-                dump_size = dump.u_ssize;
-#else
                dump_size = dump.u_ssize << PAGE_SHIFT;
-#endif
                DUMP_WRITE(dump_start,dump_size);
        }
 /* Finally dump the task struct.  Not be used by gdb, but could be useful */
@@ -205,29 +168,24 @@ static unsigned long __user *create_aout_tables(char __user *p, struct linux_bin
        int envc = bprm->envc;
        sp = (void __user *)((-(unsigned long)sizeof(char *)) & (unsigned long) p);
-#ifdef __sparc__
-        /* This imposes the proper stack alignment for a new process. */
-        sp = (void __user *) (((unsigned long) sp) & ~7);
-        if ((envc+argc+3)&1) --sp;
-#endif
 #ifdef __alpha__
 /* whee.. test-programs are so much fun. */
        put_user(0, --sp);
        put_user(0, --sp);
        if (bprm->loader) {
                put_user(0, --sp);
-                put_user(0x3eb, --sp);
+                put_user(1003, --sp);
                put_user(bprm->loader, --sp);
-                put_user(0x3ea, --sp);
+                put_user(1002, --sp);
        }
        put_user(bprm->exec, --sp);
-        put_user(0x3e9, --sp);
+        put_user(1001, --sp);
 #endif
        sp -= envc+1;
        envp = (char __user * __user *) sp;
        sp -= argc+1;
        argv = (char __user * __user *) sp;
-#if defined(__i386__) || defined(__mc68000__) || defined(__arm__) || defined(__arch_um__)
+#ifndef __alpha__
        put_user((unsigned long) envp,--sp);
        put_user((unsigned long) argv,--sp);
 #endif
@@ -300,13 +258,8 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
                return retval;
        /* OK, This is the point of no return */
-#if defined(__alpha__)
+#ifdef __alpha__
        SET_AOUT_PERSONALITY(bprm, ex);
-#elif defined(__sparc__)
-        set_personality(PER_SUNOS);
-#if !defined(__sparc_v9__)
-        memcpy(&current->thread.core_exec, &ex, sizeof(struct exec));
-#endif
 #else
        set_personality(PER_LINUX);
 #endif
@@ -322,24 +275,6 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
        install_exec_creds(bprm);
        current->flags &= ~PF_FORKNOEXEC;
-#ifdef __sparc__
-        if (N_MAGIC(ex) == NMAGIC) {
-                loff_t pos = fd_offset;
-                /* Fuck me plenty... */
-                /* <AOL></AOL> */
-                down_write(&current->mm->mmap_sem);     
-                error = do_brk(N_TXTADDR(ex), ex.a_text);
-                up_write(&current->mm->mmap_sem);
-                bprm->file->f_op->read(bprm->file, (char *) N_TXTADDR(ex),
-                          ex.a_text, &pos);
-                down_write(&current->mm->mmap_sem);
-                error = do_brk(N_DATADDR(ex), ex.a_data);
-                up_write(&current->mm->mmap_sem);
-                bprm->file->f_op->read(bprm->file, (char *) N_DATADDR(ex),
-                          ex.a_data, &pos);
-                goto beyond_if;
-        }
-#endif
        if (N_MAGIC(ex) == OMAGIC) {
                unsigned long text_addr, map_size;
@@ -347,7 +282,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
                text_addr = N_TXTADDR(ex);
-#if defined(__alpha__) || defined(__sparc__)
+#ifdef __alpha__
                pos = fd_offset;
                map_size = ex.a_text+ex.a_data + PAGE_SIZE - 1;
 #else
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index c41fa2af7677..e3ff2b9e602f 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -152,8 +152,10 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
        elf_addr_t __user *sp;
        elf_addr_t __user *u_platform;
        elf_addr_t __user *u_base_platform;
+        elf_addr_t __user *u_rand_bytes;
        const char *k_platform = ELF_PLATFORM;
        const char *k_base_platform = ELF_BASE_PLATFORM;
+        unsigned char k_rand_bytes[16];
        int items;
        elf_addr_t *elf_info;
        int ei_index = 0;
@@ -196,6 +198,15 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
                        return -EFAULT;
        }
+        /*
+         * Generate 16 random bytes for userspace PRNG seeding.
+         */
+        get_random_bytes(k_rand_bytes, sizeof(k_rand_bytes));
+        u_rand_bytes = (elf_addr_t __user *)
+                       STACK_ALLOC(p, sizeof(k_rand_bytes));
+        if (__copy_to_user(u_rand_bytes, k_rand_bytes, sizeof(k_rand_bytes)))
+                return -EFAULT;
        /* Create the ELF interpreter info */
        elf_info = (elf_addr_t *)current->mm->saved_auxv;
        /* update AT_VECTOR_SIZE_BASE if the number of NEW_AUX_ENT() changes */
@@ -228,6 +239,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
        NEW_AUX_ENT(AT_GID, cred->gid);
        NEW_AUX_ENT(AT_EGID, cred->egid);
        NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm));
+        NEW_AUX_ENT(AT_RANDOM, (elf_addr_t)(unsigned long)u_rand_bytes);
        NEW_AUX_ENT(AT_EXECFN, bprm->exec);
        if (k_platform) {
                NEW_AUX_ENT(AT_PLATFORM,
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index aa5b43205e37..f3e72c5c19f5 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -168,9 +168,6 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
        struct elf_fdpic_params exec_params, interp_params;
        struct elf_phdr *phdr;
        unsigned long stack_size, entryaddr;
-#ifndef CONFIG_MMU
-        unsigned long fullsize;
-#endif
 #ifdef ELF_FDPIC_PLAT_INIT
        unsigned long dynaddr;
 #endif
@@ -390,11 +387,6 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
                goto error_kill;
        }
-        /* expand the stack mapping to use up the entire allocation granule */
-        fullsize = kobjsize((char *) current->mm->start_brk);
-        if (!IS_ERR_VALUE(do_mremap(current->mm->start_brk, stack_size,
-                                    fullsize, 0, 0)))
-                stack_size = fullsize;
        up_write(&current->mm->mmap_sem);
        current->mm->brk = current->mm->start_brk;
@@ -1567,11 +1559,9 @@ end_coredump:
 static int elf_fdpic_dump_segments(struct file *file, size_t *size,
                           unsigned long *limit, unsigned long mm_flags)
 {
-        struct vm_list_struct *vml;
+        struct vm_area_struct *vma;
-        for (vml = current->mm->context.vmlist; vml; vml = vml->next) {
-        struct vm_area_struct *vma = vml->vma;
+        for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
                if (!maydump(vma, mm_flags))
                        continue;
@@ -1617,9 +1607,6 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
        elf_fpxregset_t *xfpu = NULL;
 #endif
        int thread_status_size = 0;
-#ifndef CONFIG_MMU
-        struct vm_list_struct *vml;
-#endif
        elf_addr_t *auxv;
        unsigned long mm_flags;
@@ -1685,13 +1672,7 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
        fill_prstatus(prstatus, current, signr);
        elf_core_copy_regs(&prstatus->pr_reg, regs);
-#ifdef CONFIG_MMU
        segs = current->mm->map_count;
-#else
-        segs = 0;
-        for (vml = current->mm->context.vmlist; vml; vml = vml->next)
-            segs++;
-#endif
 #ifdef ELF_CORE_EXTRA_PHDRS
        segs += ELF_CORE_EXTRA_PHDRS;
 #endif
@@ -1766,20 +1747,10 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
        mm_flags = current->mm->flags;
        /* write program headers for segments dump */
-        for (
+        for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
-#ifdef CONFIG_MMU
-                vma = current->mm->mmap; vma; vma = vma->vm_next
-#else
-                        vml = current->mm->context.vmlist; vml; vml = vml->next
-#endif
-             ) {
                struct elf_phdr phdr;
                size_t sz;
-#ifndef CONFIG_MMU
-                vma = vml->vma;
-#endif
                sz = vma->vm_end - vma->vm_start;
                phdr.p_type = PT_LOAD;
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 7bbd5c6b3725..5cebf0b37798 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -417,8 +417,8 @@ static int load_flat_file(struct linux_binprm * bprm,
        unsigned long textpos = 0, datapos = 0, result;
        unsigned long realdatastart = 0;
        unsigned long text_len, data_len, bss_len, stack_len, flags;
-        unsigned long len, reallen, memp = 0;
+        unsigned long len, memp = 0;
-        unsigned long extra, rlim;
+        unsigned long memp_size, extra, rlim;
        unsigned long *reloc = 0, *rp;
        struct inode *inode;
        int i, rev, relocs = 0;
@@ -543,17 +543,10 @@ static int load_flat_file(struct linux_binprm * bprm,
                }
                len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long);
+                len = PAGE_ALIGN(len);
                down_write(&current->mm->mmap_sem);
                realdatastart = do_mmap(0, 0, len,
                        PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0);
-                /* Remap to use all availabe slack region space */
-                if (realdatastart && (realdatastart < (unsigned long)-4096)) {
-                        reallen = kobjsize((void *)realdatastart);
-                        if (reallen > len) {
-                                realdatastart = do_mremap(realdatastart, len,
-                                        reallen, MREMAP_FIXED, realdatastart);
-                        }
-                }
                up_write(&current->mm->mmap_sem);
                if (realdatastart == 0 || realdatastart >= (unsigned long)-4096) {
@@ -591,21 +584,14 @@ static int load_flat_file(struct linux_binprm * bprm,
                reloc = (unsigned long *) (datapos+(ntohl(hdr->reloc_start)-text_len));
                memp = realdatastart;
+                memp_size = len;
        } else {
                len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long);
+                len = PAGE_ALIGN(len);
                down_write(&current->mm->mmap_sem);
                textpos = do_mmap(0, 0, len,
                        PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0);
-                /* Remap to use all availabe slack region space */
-                if (textpos && (textpos < (unsigned long) -4096)) {
-                        reallen = kobjsize((void *)textpos);
-                        if (reallen > len) {
-                                textpos = do_mremap(textpos, len, reallen,
-                                        MREMAP_FIXED, textpos);
-                        }
-                }
                up_write(&current->mm->mmap_sem);
                if (!textpos  || textpos >= (unsigned long) -4096) {
@@ -622,7 +608,7 @@ static int load_flat_file(struct linux_binprm * bprm,
                reloc = (unsigned long *) (textpos + ntohl(hdr->reloc_start) +
                                MAX_SHARED_LIBS * sizeof(unsigned long));
                memp = textpos;
+                memp_size = len;
 #ifdef CONFIG_BINFMT_ZFLAT
                /*
                 * load it all in and treat it like a RAM load from now on
@@ -680,10 +666,12 @@ static int load_flat_file(struct linux_binprm * bprm,
                 * set up the brk stuff, uses any slack left in data/bss/stack
                 * allocation.  We put the brk after the bss (between the bss
                 * and stack) like other platforms.
+                 * Userspace code relies on the stack pointer starting out at
+                 * an address right at the end of a page.
                 */
                current->mm->start_brk = datapos + data_len + bss_len;
                current->mm->brk = (current->mm->start_brk + 3) & ~3;
-                current->mm->context.end_brk = memp + kobjsize((void *) memp) - stack_len;
+                current->mm->context.end_brk = memp + memp_size - stack_len;
        }
        if (flags & FLAT_FLAG_KTRACE)
@@ -790,8 +778,8 @@ static int load_flat_file(struct linux_binprm * bprm,
        /* zero the BSS,  BRK and stack areas */
        memset((void*)(datapos + data_len), 0, bss_len + 
-                        (memp + kobjsize((void *) memp) - stack_len -   /* end brk */
+                        (memp + memp_size - stack_len -         /* end brk */
-                        libinfo->lib_list[id].start_brk) +              /* start brk */
+                        libinfo->lib_list[id].start_brk) +      /* start brk */
                        stack_len);
        return 0;
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index f2744ab4e5b3..c4e83537ead7 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -496,9 +496,6 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode)
        if (inode) {
                inode->i_mode = mode;
-                inode->i_uid = 0;
-                inode->i_gid = 0;
-                inode->i_blocks = 0;
                inode->i_atime = inode->i_mtime = inode->i_ctime =
                        current_fs_time(inode->i_sb);
        }
@@ -652,7 +649,7 @@ static const struct file_operations bm_register_operations = {
 static ssize_t
 bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
 {
-        char *s = enabled ? "enabled" : "disabled";
+        char *s = enabled ? "enabled\n" : "disabled\n";
        return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s));
 }
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 77ebc3c263d6..549b0144da11 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -140,7 +140,6 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
        iv = bip_vec_idx(bip, bip->bip_vcnt);
        BUG_ON(iv == NULL);
-        BUG_ON(iv->bv_page != NULL);
        iv->bv_page = page;
        iv->bv_len = len;
@@ -465,7 +464,7 @@ static int bio_integrity_verify(struct bio *bio)
                if (ret) {
                        kunmap_atomic(kaddr, KM_USER0);
-                        break;
+                        return ret;
                }
                sectors = bv->bv_len / bi->sector_size;
@@ -493,18 +492,13 @@ static void bio_integrity_verify_fn(struct work_struct *work)
        struct bio_integrity_payload *bip =
                container_of(work, struct bio_integrity_payload, bip_work);
        struct bio *bio = bip->bip_bio;
-        int error = bip->bip_error;
+        int error;
-        if (bio_integrity_verify(bio)) {
+        error = bio_integrity_verify(bio);
-                clear_bit(BIO_UPTODATE, &bio->bi_flags);
-                error = -EIO;
-        }
        /* Restore original bio completion handler */
        bio->bi_end_io = bip->bip_end_io;
+        bio_endio(bio, error);
-        if (bio->bi_end_io)
-                bio->bi_end_io(bio, error);
 }
 /**
@@ -525,7 +519,17 @@ void bio_integrity_endio(struct bio *bio, int error)
        BUG_ON(bip->bip_bio != bio);
-        bip->bip_error = error;
+        /* In case of an I/O error there is no point in verifying the
+         * integrity metadata.  Restore original bio end_io handler
+         * and run it.
+         */
+        if (error) {
+                bio->bi_end_io = bip->bip_end_io;
+                bio_endio(bio, error);
+                return;
+        }
        INIT_WORK(&bip->bip_work, bio_integrity_verify_fn);
        queue_work(kintegrityd_wq, &bip->bip_work);
 }
diff --git a/fs/bio.c b/fs/bio.c
index 711cee103602..062299acbccd 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -788,6 +788,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
        int i, ret;
        int nr_pages = 0;
        unsigned int len = 0;
+        unsigned int offset = map_data ? map_data->offset & ~PAGE_MASK : 0;
        for (i = 0; i < iov_count; i++) {
                unsigned long uaddr;
@@ -814,35 +815,42 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
        bio->bi_rw |= (!write_to_vm << BIO_RW);
        ret = 0;
-        i = 0;
+        if (map_data) {
+                nr_pages = 1 << map_data->page_order;
+                i = map_data->offset / PAGE_SIZE;
+        }
        while (len) {
-                unsigned int bytes;
+                unsigned int bytes = PAGE_SIZE;
-                if (map_data)
+                bytes -= offset;
-                        bytes = 1U << (PAGE_SHIFT + map_data->page_order);
-                else
-                        bytes = PAGE_SIZE;
                if (bytes > len)
                        bytes = len;
                if (map_data) {
-                        if (i == map_data->nr_entries) {
+                        if (i == map_data->nr_entries * nr_pages) {
                                ret = -ENOMEM;
                                break;
                        }
-                        page = map_data->pages[i++];
-                } else
+                        page = map_data->pages[i / nr_pages];
+                        page += (i % nr_pages);
+                        i++;
+                } else {
                        page = alloc_page(q->bounce_gfp | gfp_mask);
-                if (!page) {
+                        if (!page) {
-                        ret = -ENOMEM;
+                                ret = -ENOMEM;
-                        break;
+                                break;
+                        }
                }
-                if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes)
+                if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes)
                        break;
                len -= bytes;
+                offset = 0;
        }
        if (ret)
@@ -851,7 +859,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
        /*
         * success
         */
-        if (!write_to_vm) {
+        if (!write_to_vm && (!map_data || !map_data->null_mapped)) {
                ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 0);
                if (ret)
                        goto cleanup;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 99e0ae1a4c78..b3c1efff5e1d 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -285,6 +285,8 @@ static void init_once(void *foo)
        INIT_LIST_HEAD(&bdev->bd_holder_list);
 #endif
        inode_init_once(&ei->vfs_inode);
+        /* Initialize mutex for freeze. */
+        mutex_init(&bdev->bd_fsfreeze_mutex);
 }
 static inline void __bd_forget(struct inode *inode)
@@ -326,12 +328,13 @@ static struct file_system_type bd_type = {
        .kill_sb        = kill_anon_super,
 };
-static struct vfsmount *bd_mnt __read_mostly;
+struct super_block *blockdev_superblock __read_mostly;
-struct super_block *blockdev_superblock;
 void __init bdev_cache_init(void)
 {
        int err;
+        struct vfsmount *bd_mnt;
        bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
                        0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
                                SLAB_MEM_SPREAD|SLAB_PANIC),
@@ -373,7 +376,7 @@ struct block_device *bdget(dev_t dev)
        struct block_device *bdev;
        struct inode *inode;
-        inode = iget5_locked(bd_mnt->mnt_sb, hash(dev),
+        inode = iget5_locked(blockdev_superblock, hash(dev),
                        bdev_test, bdev_set, &dev);
        if (!inode)
@@ -463,7 +466,7 @@ void bd_forget(struct inode *inode)
        spin_lock(&bdev_lock);
        if (inode->i_bdev) {
-                if (inode->i_sb != blockdev_superblock)
+                if (!sb_is_blkdev_sb(inode->i_sb))
                        bdev = inode->i_bdev;
                __bd_forget(inode);
        }
@@ -1004,6 +1007,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
        }
        lock_kernel();
+ restart:
        ret = -ENXIO;
        disk = get_gendisk(bdev->bd_dev, &partno);
@@ -1024,6 +1028,19 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
                        if (disk->fops->open) {
                                ret = disk->fops->open(bdev, mode);
+                                if (ret == -ERESTARTSYS) {
+                                        /* Lost a race with 'disk' being
+                                         * deleted, try again.
+                                         * See md.c
+                                         */
+                                        disk_put_part(bdev->bd_part);
+                                        bdev->bd_part = NULL;
+                                        module_put(disk->fops->owner);
+                                        put_disk(disk);
+                                        bdev->bd_disk = NULL;
+                                        mutex_unlock(&bdev->bd_mutex);
+                                        goto restart;
+                                }
                                if (ret)
                                        goto out_clear;
                        }
@@ -1219,6 +1236,20 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
        return blkdev_ioctl(bdev, mode, cmd, arg);
 }
+/*
+ * Try to release a page associated with block device when the system
+ * is under memory pressure.
+ *
+ * @page: page to release; page->mapping->host is the inode from which
+ *        the block device, and the superblock recorded in its bd_super
+ *        field, are looked up.
+ * @wait: gfp_t value handed through unchanged to the superblock hook.
+ *
+ * If bd_super is set and that superblock's s_op provides
+ * bdev_try_to_free_page, the call is delegated to that hook with
+ * (super, page, wait).  Otherwise it falls back to
+ * try_to_free_buffers(page).  The return value is whatever the chosen
+ * callee returns.
+ */
+static int blkdev_releasepage(struct page *page, gfp_t wait)
+{
+        struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super;
+        if (super && super->s_op->bdev_try_to_free_page)
+                return super->s_op->bdev_try_to_free_page(super, page, wait);
+        return try_to_free_buffers(page);
+}
 static const struct address_space_operations def_blk_aops = {
        .readpage       = blkdev_readpage,
        .writepage      = blkdev_writepage,
@@ -1226,6 +1257,7 @@ static const struct address_space_operations def_blk_aops = {
        .write_begin    = blkdev_write_begin,
        .write_end      = blkdev_write_end,
        .writepages     = generic_writepages,
+        .releasepage    = blkdev_releasepage,
        .direct_IO      = blkdev_direct_IO,
 };
@@ -1261,7 +1293,7 @@ EXPORT_SYMBOL(ioctl_by_bdev);
 /**
 * lookup_bdev  - lookup a struct block_device by name
- * @path:       special file representing the block device
+ * @pathname:   special file representing the block device
 *
 * Get a reference to the blockdevice at @pathname in the current
 * namespace if possible and return it.  Return ERR_PTR(error)
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
new file mode 100644
index 000000000000..f8fcf999ea1b
--- /dev/null
+++ b/fs/btrfs/Kconfig
@@ -0,0 +1,18 @@
+config BTRFS_FS
+        tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format"
+        depends on EXPERIMENTAL
+        select LIBCRC32C
+        select ZLIB_INFLATE
+        select ZLIB_DEFLATE
+        help
+          Btrfs is a new filesystem with extents, writable snapshotting,
+          support for multiple devices and many more features.
+          Btrfs is highly experimental, and THE DISK FORMAT IS NOT YET
+          FINALIZED.  You should say N here unless you are interested in
+          testing Btrfs with non-critical data.
+          To compile this file system support as a module, choose M here. The
+          module will be called btrfs.
+          If unsure, say N.
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
new file mode 100644
index 000000000000..d2cf5a54a4b8
--- /dev/null
+++ b/fs/btrfs/Makefile
@@ -0,0 +1,25 @@
+ifneq ($(KERNELRELEASE),)
+# kbuild part of makefile
+obj-$(CONFIG_BTRFS_FS) := btrfs.o
+btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
+           file-item.o inode-item.o inode-map.o disk-io.o \
+           transaction.o inode.o file.o tree-defrag.o \
+           extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
+           extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
+           ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \
+           compression.o
+else
+# Normal Makefile
+KERNELDIR := /lib/modules/`uname -r`/build
+all:
+        $(MAKE) -C $(KERNELDIR) M=`pwd` CONFIG_BTRFS_FS=m modules
+modules_install:
+        $(MAKE) -C $(KERNELDIR) M=`pwd` modules_install
+clean:
+        $(MAKE) -C $(KERNELDIR) M=`pwd` clean
+endif
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
new file mode 100644
index 000000000000..1d53b62dbba5
--- /dev/null
+++ b/fs/btrfs/acl.c
@@ -0,0 +1,351 @@
+/*
+ * Copyright (C) 2007 Red Hat.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#include <linux/fs.h>
+#include <linux/string.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/posix_acl.h>
+#include <linux/sched.h>
+#include "ctree.h"
+#include "btrfs_inode.h"
+#include "xattr.h"
+#ifdef CONFIG_FS_POSIX_ACL
+/*
+ * Replace the ACL cached in *p_acl with a new reference to @acl.
+ *
+ * The previously cached ACL (if it was a real ACL and not the
+ * BTRFS_ACL_NOT_CACHED marker) has its reference dropped.  The swap is
+ * done under inode->i_lock.
+ */
+static void btrfs_update_cached_acl(struct inode *inode,
+                                    struct posix_acl **p_acl,
+                                    struct posix_acl *acl)
+{
+        struct posix_acl *old;
+
+        spin_lock(&inode->i_lock);
+        old = *p_acl;
+        if (old != NULL && old != BTRFS_ACL_NOT_CACHED)
+                posix_acl_release(old);
+        *p_acl = posix_acl_dup(acl);
+        spin_unlock(&inode->i_lock);
+}
+/*
+ * Look up the access or default ACL of @inode.
+ *
+ * The per-inode cache is consulted first; on a miss the ACL is read from
+ * the matching POSIX ACL xattr and the cache is refreshed.
+ *
+ * Returns a referenced ACL that the caller must drop with
+ * posix_acl_release(), NULL when no ACL was found, or an ERR_PTR():
+ * -EINVAL for an unknown @type, -ENOMEM if the xattr buffer cannot be
+ * allocated, or whatever posix_acl_from_xattr() reports for a malformed
+ * xattr value.
+ */
+static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
+{
+        int size;
+        const char *name;
+        char *value = NULL;
+        struct posix_acl *acl = NULL, **p_acl;
+
+        switch (type) {
+        case ACL_TYPE_ACCESS:
+                name = POSIX_ACL_XATTR_ACCESS;
+                p_acl = &BTRFS_I(inode)->i_acl;
+                break;
+        case ACL_TYPE_DEFAULT:
+                name = POSIX_ACL_XATTR_DEFAULT;
+                p_acl = &BTRFS_I(inode)->i_default_acl;
+                break;
+        default:
+                return ERR_PTR(-EINVAL);
+        }
+
+        spin_lock(&inode->i_lock);
+        if (*p_acl != BTRFS_ACL_NOT_CACHED)
+                acl = posix_acl_dup(*p_acl);
+        spin_unlock(&inode->i_lock);
+        if (acl)
+                return acl;
+
+        /* first call with an empty buffer only asks for the xattr size */
+        size = __btrfs_getxattr(inode, name, "", 0);
+        if (size > 0) {
+                value = kzalloc(size, GFP_NOFS);
+                if (!value)
+                        return ERR_PTR(-ENOMEM);
+                size = __btrfs_getxattr(inode, name, value, size);
+                if (size > 0) {
+                        acl = posix_acl_from_xattr(value, size);
+                        /*
+                         * posix_acl_from_xattr() can hand back an ERR_PTR
+                         * for a corrupt value; that must be returned to the
+                         * caller as-is and never dup'ed into the cache.
+                         */
+                        if (!IS_ERR(acl))
+                                btrfs_update_cached_acl(inode, p_acl, acl);
+                }
+                kfree(value);
+        } else if (size == -ENOENT) {
+                /* remember that this inode has no ACL of this type */
+                acl = NULL;
+                btrfs_update_cached_acl(inode, p_acl, acl);
+        }
+        return acl;
+}
+/*
+ * Serialise the ACL of the given @type into the xattr buffer @value.
+ *
+ * Returns whatever posix_acl_to_xattr() returns, -ENODATA when the inode
+ * has no such ACL, or the error reported by btrfs_get_acl().
+ */
+static int btrfs_xattr_get_acl(struct inode *inode, int type,
+                               void *value, size_t size)
+{
+        struct posix_acl *acl = btrfs_get_acl(inode, type);
+        int len;
+
+        if (IS_ERR(acl))
+                return PTR_ERR(acl);
+        if (!acl)
+                return -ENODATA;
+
+        len = posix_acl_to_xattr(acl, value, size);
+        posix_acl_release(acl);
+        return len;
+}
+/*
+ * Store @acl as the access or default ACL of @inode, or remove that ACL
+ * when @acl is NULL, and refresh the cached copy on success.
+ *
+ * For ACL_TYPE_ACCESS the inode mode is updated from the ACL via
+ * posix_acl_equiv_mode().  A default ACL on a non-directory is rejected
+ * with -EINVAL (removing one is a no-op).
+ *
+ * Needs to be called with fs_mutex held
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+{
+        int ret = 0, size = 0;
+        const char *name;
+        struct posix_acl **p_acl;
+        char *value = NULL;
+        mode_t mode;
+
+        if (acl) {
+                ret = posix_acl_valid(acl);
+                if (ret < 0)
+                        return ret;
+                ret = 0;
+        }
+
+        switch (type) {
+        case ACL_TYPE_ACCESS:
+                name = POSIX_ACL_XATTR_ACCESS;
+                p_acl = &BTRFS_I(inode)->i_acl;
+                /*
+                 * acl is NULL when the ACL is being removed; there is
+                 * nothing to fold into the mode then, and
+                 * posix_acl_equiv_mode() must not be handed a NULL acl.
+                 */
+                if (acl) {
+                        mode = inode->i_mode;
+                        ret = posix_acl_equiv_mode(acl, &mode);
+                        if (ret < 0)
+                                return ret;
+                        ret = 0;
+                        inode->i_mode = mode;
+                }
+                break;
+        case ACL_TYPE_DEFAULT:
+                if (!S_ISDIR(inode->i_mode))
+                        return acl ? -EINVAL : 0;
+                name = POSIX_ACL_XATTR_DEFAULT;
+                p_acl = &BTRFS_I(inode)->i_default_acl;
+                break;
+        default:
+                return -EINVAL;
+        }
+
+        if (acl) {
+                size = posix_acl_xattr_size(acl->a_count);
+                value = kmalloc(size, GFP_NOFS);
+                if (!value) {
+                        ret = -ENOMEM;
+                        goto out;
+                }
+                ret = posix_acl_to_xattr(acl, value, size);
+                if (ret < 0)
+                        goto out;
+        }
+
+        /* value == NULL / size == 0 here when the ACL is being removed */
+        ret = __btrfs_setxattr(inode, name, value, size, 0);
+out:
+        kfree(value);
+        if (!ret)
+                btrfs_update_cached_acl(inode, p_acl, acl);
+        return ret;
+}
+/*
+ * Common backend of the ACL xattr "set" handlers.
+ *
+ * Decodes the xattr blob in @value (a NULL @value removes the ACL) and
+ * installs it with btrfs_set_acl().
+ *
+ * Returns 0 on success, -EPERM if the caller neither owns the inode nor
+ * has CAP_FOWNER, or a negative errno from decoding/storing the ACL.
+ */
+static int btrfs_xattr_set_acl(struct inode *inode, int type,
+                               const void *value, size_t size)
+{
+        int ret;
+        struct posix_acl *acl = NULL;
+
+        /*
+         * Nothing else visible on this path checks ownership, so refuse
+         * here; otherwise any user could rewrite the ACL of any file.
+         */
+        if (!is_owner_or_cap(inode))
+                return -EPERM;
+
+        if (value) {
+                acl = posix_acl_from_xattr(value, size);
+                if (IS_ERR(acl))
+                        return PTR_ERR(acl);
+        }
+
+        /* acl may legitimately be NULL here: that removes the ACL */
+        ret = btrfs_set_acl(inode, acl, type);
+        posix_acl_release(acl);
+        return ret;
+}
+/* xattr ->get hook for the access ACL; @name is not used. */
+static int btrfs_xattr_acl_access_get(struct inode *inode, const char *name,
+                                      void *value, size_t size)
+{
+        return btrfs_xattr_get_acl(inode, ACL_TYPE_ACCESS, value, size);
+}
+/* xattr ->set hook for the access ACL; @name and @flags are not used. */
+static int btrfs_xattr_acl_access_set(struct inode *inode, const char *name,
+                                      const void *value, size_t size, int flags)
+{
+        return btrfs_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
+}
+/* xattr ->get hook for the default ACL; @name is not used. */
+static int btrfs_xattr_acl_default_get(struct inode *inode, const char *name,
+                                       void *value, size_t size)
+{
+        return btrfs_xattr_get_acl(inode, ACL_TYPE_DEFAULT, value, size);
+}
+/* xattr ->set hook for the default ACL; @name and @flags are not used. */
+static int btrfs_xattr_acl_default_set(struct inode *inode, const char *name,
+                               const void *value, size_t size, int flags)
+{
+        return btrfs_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
+}
+/*
+ * Evaluate @mask against the access ACL of @inode.
+ *
+ * Returns the verdict of posix_acl_permission() when the inode has an
+ * access ACL, -EAGAIN when it has none, or the error from btrfs_get_acl().
+ */
+int btrfs_check_acl(struct inode *inode, int mask)
+{
+        struct posix_acl *acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
+        int error;
+
+        if (IS_ERR(acl))
+                return PTR_ERR(acl);
+        if (!acl)
+                return -EAGAIN;
+
+        error = posix_acl_permission(inode, acl, mask);
+        posix_acl_release(acl);
+        return error;
+}
+/*
+ * Set up the ACLs of a newly created @inode from the default ACL of its
+ * parent directory @dir.
+ *
+ * Without a default ACL on @dir the process umask is applied to the new
+ * inode's mode (symlinks are left alone).  With one, a directory inherits
+ * it as its own default ACL, and a clone masked by the creation mode
+ * becomes the access ACL when the mode bits alone cannot express it.
+ *
+ * btrfs_init_acl is already generally called under fs_mutex, so the locking
+ * stuff has been fixed to work with that.  If the locking stuff changes, we
+ * need to re-evaluate the acl locking stuff.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int btrfs_init_acl(struct inode *inode, struct inode *dir)
+{
+        struct posix_acl *acl = NULL;
+        int ret = 0;
+
+        /* this happens with subvols */
+        if (!dir)
+                return 0;
+
+        if (!S_ISLNK(inode->i_mode)) {
+                if (IS_POSIXACL(dir)) {
+                        acl = btrfs_get_acl(dir, ACL_TYPE_DEFAULT);
+                        if (IS_ERR(acl))
+                                return PTR_ERR(acl);
+                }
+                if (!acl)
+                        inode->i_mode &= ~current->fs->umask;
+        }
+
+        if (IS_POSIXACL(dir) && acl) {
+                struct posix_acl *clone;
+                mode_t mode;
+
+                if (S_ISDIR(inode->i_mode)) {
+                        ret = btrfs_set_acl(inode, acl, ACL_TYPE_DEFAULT);
+                        if (ret)
+                                goto failed;
+                }
+                clone = posix_acl_clone(acl, GFP_NOFS);
+                ret = -ENOMEM;
+                if (!clone)
+                        goto failed;
+
+                mode = inode->i_mode;
+                ret = posix_acl_create_masq(clone, &mode);
+                if (ret >= 0) {
+                        inode->i_mode = mode;
+                        if (ret > 0) {
+                                /* we need an acl */
+                                ret = btrfs_set_acl(inode, clone,
+                                                    ACL_TYPE_ACCESS);
+                        }
+                }
+                /*
+                 * btrfs_set_acl() takes its own reference for the cache,
+                 * so our reference on the clone must be dropped here or
+                 * it is leaked on every create.
+                 */
+                posix_acl_release(clone);
+        }
+failed:
+        posix_acl_release(acl);
+        return ret;
+}
+/*
+ * Bring the access ACL of @inode back in line with inode->i_mode after a
+ * mode change.
+ *
+ * Returns -EOPNOTSUPP for symlinks, 0 when ACLs are not enabled or the
+ * inode has no access ACL, otherwise the result of masking and storing
+ * the updated ACL.
+ */
+int btrfs_acl_chmod(struct inode *inode)
+{
+        struct posix_acl *acl, *clone;
+        int ret;
+
+        if (S_ISLNK(inode->i_mode))
+                return -EOPNOTSUPP;
+        if (!IS_POSIXACL(inode))
+                return 0;
+
+        acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
+        if (IS_ERR(acl))
+                return PTR_ERR(acl);
+        if (!acl)
+                return 0;
+
+        /* work on a private copy; the original may be shared via the cache */
+        clone = posix_acl_clone(acl, GFP_KERNEL);
+        posix_acl_release(acl);
+        if (!clone)
+                return -ENOMEM;
+
+        ret = posix_acl_chmod_masq(clone, inode->i_mode);
+        if (ret == 0)
+                ret = btrfs_set_acl(inode, clone, ACL_TYPE_ACCESS);
+        posix_acl_release(clone);
+        return ret;
+}
+/* xattr handler table for the default ACL (POSIX_ACL_XATTR_DEFAULT) */
+struct xattr_handler btrfs_xattr_acl_default_handler = {
+        .prefix = POSIX_ACL_XATTR_DEFAULT,
+        .get    = btrfs_xattr_acl_default_get,
+        .set    = btrfs_xattr_acl_default_set,
+};
+/* xattr handler table for the access ACL (POSIX_ACL_XATTR_ACCESS) */
+struct xattr_handler btrfs_xattr_acl_access_handler = {
+        .prefix = POSIX_ACL_XATTR_ACCESS,
+        .get    = btrfs_xattr_acl_access_get,
+        .set    = btrfs_xattr_acl_access_set,
+};
+#else /* CONFIG_FS_POSIX_ACL */
+/* Stub used when CONFIG_FS_POSIX_ACL is off: nothing to update. */
+int btrfs_acl_chmod(struct inode *inode)
+{
+        return 0;
+}
+/* Stub used when CONFIG_FS_POSIX_ACL is off: no ACL is inherited. */
+int btrfs_init_acl(struct inode *inode, struct inode *dir)
+{
+        return 0;
+}
+/*
+ * Stub used when CONFIG_FS_POSIX_ACL is off.
+ *
+ * Returns -EAGAIN, the same value the real implementation returns for an
+ * inode without an access ACL.  Returning 0 here would read as "the ACL
+ * granted access" rather than "there is no ACL to consult".
+ */
+int btrfs_check_acl(struct inode *inode, int mask)
+{
+        return -EAGAIN;
+}
+#endif /* CONFIG_FS_POSIX_ACL */
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
new file mode 100644
index 000000000000..8e2fec05dbe0
--- /dev/null
+++ b/fs/btrfs/async-thread.c
@@ -0,0 +1,419 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#include <linux/version.h>
+#include <linux/kthread.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+# include <linux/freezer.h>
+#include "async-thread.h"
+#define WORK_QUEUED_BIT 0
+#define WORK_DONE_BIT 1
+#define WORK_ORDER_DONE_BIT 2
+/*
+ * container for the kthread task pointer and the list of pending work
+ * One of these is allocated per thread.
+ */
+struct btrfs_worker_thread {
+        /* pool we belong to */
+        struct btrfs_workers *workers;
+        /* list of struct btrfs_work that are waiting for service */
+        struct list_head pending;
+        /* list of worker threads from struct btrfs_workers */
+        struct list_head worker_list;
+        /* kthread */
+        struct task_struct *task;
+        /* number of things on the pending list */
+        atomic_t num_pending;
+        /*
+         * bumped each time next_worker() picks this thread while it is
+         * busy; used modulo idle_thresh to decide when to rotate the
+         * thread to the tail of the pool's worker_list
+         */
+        unsigned long sequence;
+        /* protects the pending list. */
+        spinlock_t lock;
+        /* set to non-zero when this thread is already awake and kicking */
+        int working;
+        /* are we currently idle */
+        int idle;
+};
+/*
+ * Put a busy thread back on the pool's idle list once its backlog has
+ * dropped below half of idle_thresh.  Does nothing if the thread is
+ * already marked idle.
+ */
+static void check_idle_worker(struct btrfs_worker_thread *worker)
+{
+        struct btrfs_workers *pool;
+        unsigned long flags;
+
+        if (worker->idle)
+                return;
+        if (atomic_read(&worker->num_pending) >= worker->workers->idle_thresh / 2)
+                return;
+
+        pool = worker->workers;
+        spin_lock_irqsave(&pool->lock, flags);
+        worker->idle = 1;
+        list_move(&worker->worker_list, &pool->idle_list);
+        spin_unlock_irqrestore(&pool->lock, flags);
+}
+/*
+ * Take an idle thread off the pool's idle list once its backlog has
+ * reached idle_thresh, queueing it at the tail of the busy worker_list.
+ * Does nothing if the thread is not marked idle.
+ */
+static void check_busy_worker(struct btrfs_worker_thread *worker)
+{
+        struct btrfs_workers *pool;
+        unsigned long flags;
+
+        if (!worker->idle)
+                return;
+        if (atomic_read(&worker->num_pending) < worker->workers->idle_thresh)
+                return;
+
+        pool = worker->workers;
+        spin_lock_irqsave(&pool->lock, flags);
+        worker->idle = 0;
+        list_move_tail(&worker->worker_list, &pool->worker_list);
+        spin_unlock_irqrestore(&pool->lock, flags);
+}
+/*
+ * Mark @work as done and run the ordered completion callbacks for every
+ * finished item at the head of the pool's order_list, in queue order.
+ *
+ * Does nothing for pools that are not in ordered mode.  Processing stops
+ * at the first item that is not done yet, or whose ordered_func has
+ * already been claimed (WORK_ORDER_DONE_BIT was set).  workers->lock is
+ * dropped while ordered_func runs; ordered_free is called with the lock
+ * held.  Always returns 0.
+ */
+static noinline int run_ordered_completions(struct btrfs_workers *workers,
+                                            struct btrfs_work *work)
+{
+        unsigned long flags;
+        if (!workers->ordered)
+                return 0;
+        set_bit(WORK_DONE_BIT, &work->flags);
+        spin_lock_irqsave(&workers->lock, flags);
+        /* from here on 'work' is reused as the cursor over order_list */
+        while (!list_empty(&workers->order_list)) {
+                work = list_entry(workers->order_list.next,
+                                  struct btrfs_work, order_list);
+                if (!test_bit(WORK_DONE_BIT, &work->flags))
+                        break;
+                /* we are going to call the ordered done function, but
+                 * we leave the work item on the list as a barrier so
+                 * that later work items that are done don't have their
+                 * functions called before this one returns
+                 */
+                if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags))
+                        break;
+                spin_unlock_irqrestore(&workers->lock, flags);
+                work->ordered_func(work);
+                /* now take the lock again and call the freeing code */
+                spin_lock_irqsave(&workers->lock, flags);
+                list_del(&work->order_list);
+                work->ordered_free(work);
+        }
+        spin_unlock_irqrestore(&workers->lock, flags);
+        return 0;
+}
+/*
+ * main loop for servicing work items
+ *
+ * Runs as the body of each worker kthread (@arg is the
+ * struct btrfs_worker_thread).  Drains the thread's pending list, calling
+ * each item's func with worker->lock dropped, then sleeps until woken or
+ * asked to stop.  Returns 0 once kthread_should_stop() is true.
+ */
+static int worker_loop(void *arg)
+{
+        struct btrfs_worker_thread *worker = arg;
+        struct list_head *cur;
+        struct btrfs_work *work;
+
+        do {
+                spin_lock_irq(&worker->lock);
+                while (!list_empty(&worker->pending)) {
+                        cur = worker->pending.next;
+                        work = list_entry(cur, struct btrfs_work, list);
+                        list_del(&work->list);
+                        clear_bit(WORK_QUEUED_BIT, &work->flags);
+                        work->worker = worker;
+                        spin_unlock_irq(&worker->lock);
+
+                        work->func(work);
+                        atomic_dec(&worker->num_pending);
+                        /*
+                         * unless this is an ordered work queue,
+                         * 'work' was probably freed by func above.
+                         */
+                        run_ordered_completions(worker->workers, work);
+
+                        spin_lock_irq(&worker->lock);
+                        check_idle_worker(worker);
+                }
+                worker->working = 0;
+                if (freezing(current)) {
+                        /*
+                         * worker->lock is still held with irqs off here.
+                         * Drop it before freezing: the next loop iteration
+                         * takes it again and would otherwise deadlock on
+                         * itself.
+                         */
+                        spin_unlock_irq(&worker->lock);
+                        refrigerator();
+                } else {
+                        /*
+                         * set the task state before unlocking so a wakeup
+                         * from a queuer that saw working == 0 is not lost
+                         */
+                        set_current_state(TASK_INTERRUPTIBLE);
+                        spin_unlock_irq(&worker->lock);
+                        if (!kthread_should_stop())
+                                schedule();
+                        __set_current_state(TASK_RUNNING);
+                }
+        } while (!kthread_should_stop());
+        return 0;
+}
+/*
+ * Stop and free every thread in the pool.  kthread_stop() is called for
+ * each thread in turn before its bookkeeping is freed.  Always returns 0.
+ */
+int btrfs_stop_workers(struct btrfs_workers *workers)
+{
+        struct btrfs_worker_thread *victim;
+
+        /* fold the idle threads in so a single walk covers them all */
+        list_splice_init(&workers->idle_list, &workers->worker_list);
+
+        while (!list_empty(&workers->worker_list)) {
+                victim = list_entry(workers->worker_list.next,
+                                    struct btrfs_worker_thread, worker_list);
+                kthread_stop(victim->task);
+                list_del(&victim->worker_list);
+                kfree(victim);
+        }
+        return 0;
+}
+/*
+ * Initialise an empty worker pool: no threads yet, unordered mode, an
+ * idle threshold of 32 and the given @name and @max thread count.
+ */
+void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
+{
+        /* caller supplied settings */
+        workers->name = name;
+        workers->max_workers = max;
+
+        /* defaults */
+        workers->num_workers = 0;
+        workers->idle_thresh = 32;
+        workers->ordered = 0;
+
+        spin_lock_init(&workers->lock);
+        INIT_LIST_HEAD(&workers->worker_list);
+        INIT_LIST_HEAD(&workers->idle_list);
+        INIT_LIST_HEAD(&workers->order_list);
+}
+/*
+ * starts new worker threads.  This does not enforce the max worker
+ * count in case you need to temporarily go past it.
+ *
+ * Each new thread is placed on the pool's idle list.  Returns 0 on
+ * success.  On failure (-ENOMEM, or the error from kthread_run()) every
+ * thread in the pool is stopped via btrfs_stop_workers() and the error
+ * is returned.
+ */
+int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
+{
+        struct btrfs_worker_thread *worker;
+        int ret = 0;
+        int i;
+
+        for (i = 0; i < num_workers; i++) {
+                worker = kzalloc(sizeof(*worker), GFP_NOFS);
+                if (!worker) {
+                        ret = -ENOMEM;
+                        goto fail;
+                }
+
+                INIT_LIST_HEAD(&worker->pending);
+                INIT_LIST_HEAD(&worker->worker_list);
+                spin_lock_init(&worker->lock);
+                atomic_set(&worker->num_pending, 0);
+                /*
+                 * fully initialise the struct before the thread exists;
+                 * kthread_run() starts worker_loop() right away
+                 */
+                worker->workers = workers;
+                worker->task = kthread_run(worker_loop, worker,
+                                           "btrfs-%s-%d", workers->name,
+                                           workers->num_workers + i);
+                if (IS_ERR(worker->task)) {
+                        /* read the error out before the struct is freed */
+                        ret = PTR_ERR(worker->task);
+                        kfree(worker);
+                        goto fail;
+                }
+
+                spin_lock_irq(&workers->lock);
+                list_add_tail(&worker->worker_list, &workers->idle_list);
+                worker->idle = 1;
+                workers->num_workers++;
+                spin_unlock_irq(&workers->lock);
+        }
+        return 0;
+fail:
+        btrfs_stop_workers(workers);
+        return ret;
+}
+/*
+ * Pick a thread for the next job.  An idle thread wins if there is one.
+ * Otherwise NULL is returned while the pool is still below max_workers
+ * (or has no threads at all), and the first busy thread is returned once
+ * the pool is at its limit.
+ */
+static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
+{
+        struct btrfs_worker_thread *worker;
+        struct list_head *entry;
+
+        /*
+         * An idle thread is deliberately left at the head of the idle
+         * list: the next submission is then likely to hit the same thread,
+         * possibly while it is still running.
+         */
+        if (!list_empty(&workers->idle_list)) {
+                entry = workers->idle_list.next;
+                return list_entry(entry, struct btrfs_worker_thread,
+                                  worker_list);
+        }
+
+        /* below the thread limit: no pick is made */
+        if (workers->num_workers < workers->max_workers)
+                return NULL;
+        if (list_empty(&workers->worker_list))
+                return NULL;
+
+        /*
+         * All threads are busy.  Take the first one and, every
+         * idle_thresh picks, rotate it to the tail so requests submitted
+         * around the same time are batched onto one thread while the load
+         * still moves across the pool.
+         */
+        entry = workers->worker_list.next;
+        worker = list_entry(entry, struct btrfs_worker_thread, worker_list);
+        atomic_inc(&worker->num_pending);
+        worker->sequence++;
+        if (worker->sequence % workers->idle_thresh == 0)
+                list_move_tail(entry, &workers->worker_list);
+        return worker;
+}
+/*
+ * selects a worker thread to take the next job.  This will either find
+ * an idle worker, start a new worker up to the max count, or just return
+ * one of the existing busy workers.
+ *
+ * NOTE(review): the return value of btrfs_start_workers() is ignored
+ * below.  Its failure path stops every thread in the pool, after which
+ * the retry can only loop or hit the BUG_ON -- confirm how callers are
+ * expected to survive thread creation failure.
+ */
+static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
+{
+        struct btrfs_worker_thread *worker;
+        unsigned long flags;
+again:
+        spin_lock_irqsave(&workers->lock, flags);
+        worker = next_worker(workers);
+        spin_unlock_irqrestore(&workers->lock, flags);
+        if (!worker) {
+                spin_lock_irqsave(&workers->lock, flags);
+                /* re-check under the lock: the pool may have grown */
+                if (workers->num_workers >= workers->max_workers) {
+                        struct list_head *fallback = NULL;
+                        /*
+                         * we have failed to find any workers, just
+                         * return the force one
+                         */
+                        if (!list_empty(&workers->worker_list))
+                                fallback = workers->worker_list.next;
+                        /* an idle thread, if any, overrides the busy pick */
+                        if (!list_empty(&workers->idle_list))
+                                fallback = workers->idle_list.next;
+                        BUG_ON(!fallback);
+                        worker = list_entry(fallback,
+                                  struct btrfs_worker_thread, worker_list);
+                        spin_unlock_irqrestore(&workers->lock, flags);
+                } else {
+                        spin_unlock_irqrestore(&workers->lock, flags);
+                        /* we're below the limit, start another worker */
+                        btrfs_start_workers(workers, 1);
+                        goto again;
+                }
+        }
+        return worker;
+}
+/*
+ * btrfs_requeue_work just puts the work item back on the tail of the list
+ * it was taken from.  It is intended for use with long running work functions
+ * that make some progress and want to give the cpu up for others.
+ *
+ * The item goes back to the thread recorded in work->worker.  Nothing is
+ * done if the item is already queued.  Always returns 0.
+ */
+int btrfs_requeue_work(struct btrfs_work *work)
+{
+        struct btrfs_worker_thread *worker = work->worker;
+        unsigned long flags;
+
+        if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
+                goto out;
+
+        spin_lock_irqsave(&worker->lock, flags);
+        atomic_inc(&worker->num_pending);
+        list_add_tail(&work->list, &worker->pending);
+
+        /* by definition we're busy, take ourselves off the idle
+         * list
+         */
+        if (worker->idle) {
+                /*
+                 * irqs are already off under worker->lock, so take the
+                 * pool lock with plain spin_lock().  A second irqsave
+                 * into the same 'flags' would overwrite the saved state
+                 * and leave interrupts disabled after the final restore.
+                 */
+                spin_lock(&worker->workers->lock);
+                worker->idle = 0;
+                list_move_tail(&worker->worker_list,
+                               &worker->workers->worker_list);
+                spin_unlock(&worker->workers->lock);
+        }
+        spin_unlock_irqrestore(&worker->lock, flags);
+out:
+        return 0;
+}
+/*
+ * Hand @work to one of the pool's threads: pick a thread, append the item
+ * to its pending list and wake the thread if it was not already marked as
+ * working.  An item that is already queued is left alone.  Always
+ * returns 0.
+ */
+int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
+{
+        struct btrfs_worker_thread *worker;
+        unsigned long flags;
+        int need_wake;
+
+        /* don't requeue something already on a list */
+        if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
+                return 0;
+
+        worker = find_worker(workers);
+
+        if (!workers->ordered) {
+                INIT_LIST_HEAD(&work->order_list);
+        } else {
+                /* ordered pools remember submission order for completion */
+                spin_lock_irqsave(&workers->lock, flags);
+                list_add_tail(&work->order_list, &workers->order_list);
+                spin_unlock_irqrestore(&workers->lock, flags);
+        }
+
+        spin_lock_irqsave(&worker->lock, flags);
+        atomic_inc(&worker->num_pending);
+        check_busy_worker(worker);
+        list_add_tail(&work->list, &worker->pending);
+        /*
+         * only the submitter that flips 'working' from 0 to 1 needs to
+         * call wake_up_process; the thread has been kicked otherwise
+         */
+        need_wake = !worker->working;
+        worker->working = 1;
+        spin_unlock_irqrestore(&worker->lock, flags);
+
+        if (need_wake)
+                wake_up_process(worker->task);
+        return 0;
+}
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
new file mode 100644
index 000000000000..31be4ed8b63e
--- /dev/null
+++ b/fs/btrfs/async-thread.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#ifndef __BTRFS_ASYNC_THREAD_
+#define __BTRFS_ASYNC_THREAD_
+struct btrfs_worker_thread;
+/*
+ * This is similar to a workqueue, but it is meant to spread the operations
+ * across all available cpus instead of just the CPU that was used to
+ * queue the work.  There is also some batching introduced to try and
+ * cut down on context switches.
+ *
+ * By default threads are added on demand up to 2 * the number of cpus.
+ * Changing struct btrfs_workers->max_workers is one way to prevent
+ * demand creation of kthreads.
+ *
+ * the basic model of these worker threads is to embed a btrfs_work
+ * structure in your own data struct, and use container_of in a
+ * work function to get back to your data struct.
+ */
+struct btrfs_work {
+        /*
+         * func should be set to the function you want called
+         * your work struct is passed as the only arg
+         *
+         * ordered_func must be set for work sent to an ordered work queue,
+         * and it is called to complete a given work item in the same
+         * order they were sent to the queue.
+         *
+         * ordered_free is called on an ordered work queue after
+         * ordered_func has returned and the item has been taken off the
+         * pool's order list; it runs with the pool lock held.
+         */
+        void (*func)(struct btrfs_work *work);
+        void (*ordered_func)(struct btrfs_work *work);
+        void (*ordered_free)(struct btrfs_work *work);
+        /*
+         * flags should be set to zero.  It is used to make sure the
+         * struct is only inserted once into the list.
+         */
+        unsigned long flags;
+        /* don't touch these */
+        /* thread that last picked this item up; used by btrfs_requeue_work */
+        struct btrfs_worker_thread *worker;
+        /* entry in the owning thread's pending list */
+        struct list_head list;
+        /* entry in the pool's order_list (ordered pools only) */
+        struct list_head order_list;
+};
+struct btrfs_workers {
+        /* current number of running workers */
+        int num_workers;
+        /* max number of workers allowed.  set by btrfs_init_workers */
+        int max_workers;
+        /*
+         * a worker is moved to the busy list once it has this many
+         * pending requests, and back to the idle list once it drops
+         * below half of this
+         */
+        int idle_thresh;
+        /* force completions in the order they were queued */
+        int ordered;
+        /* list with all the work threads.  The workers on the idle list
+         * may be actively servicing jobs, but they haven't yet hit the
+         * idle thresh limit above.
+         */
+        struct list_head worker_list;
+        struct list_head idle_list;
+        /*
+         * when operating in ordered mode, this maintains the list
+         * of work items waiting for completion
+         */
+        struct list_head order_list;
+        /* lock for finding the next worker thread to queue on */
+        spinlock_t lock;
+        /* extra name for this worker, used for current->name */
+        char *name;
+};
+int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
+int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
+int btrfs_stop_workers(struct btrfs_workers *workers);
+void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max);
+int btrfs_requeue_work(struct btrfs_work *work);
+#endif
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
new file mode 100644
index 000000000000..a8c9693b75ac
--- /dev/null
+++ b/fs/btrfs/btrfs_inode.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#ifndef __BTRFS_I__
+#define __BTRFS_I__
+#include "extent_map.h"
+#include "extent_io.h"
+#include "ordered-data.h"
+/* in memory btrfs inode */
+struct btrfs_inode {
+        /* which subvolume this inode belongs to */
+        struct btrfs_root *root;
+        /* key used to find this inode on disk.  This is used by the code
+         * to read in roots of subvolumes
+         */
+        struct btrfs_key location;
+        /* the extent_tree has caches of all the extent mappings to disk */
+        struct extent_map_tree extent_tree;
+        /* the io_tree does range state (DIRTY, LOCKED etc) */
+        struct extent_io_tree io_tree;
+        /* special utility tree used to record which mirrors have already been
+         * tried when checksums fail for a given block
+         */
+        struct extent_io_tree io_failure_tree;
+        /* held while inserting or deleting extents from files */
+        struct mutex extent_mutex;
+        /* held while logging the inode in tree-log.c */
+        struct mutex log_mutex;
+        /* used to order data wrt metadata */
+        struct btrfs_ordered_inode_tree ordered_tree;
+        /* standard acl pointers */
+        struct posix_acl *i_acl;
+        struct posix_acl *i_default_acl;
+        /* for keeping track of orphaned inodes */
+        struct list_head i_orphan;
+        /* list of all the delalloc inodes in the FS.  There are times we need
+         * to write all the delalloc pages to disk, and this list is used
+         * to walk them all.
+         */
+        struct list_head delalloc_inodes;
+        /* full 64 bit generation number, struct inode doesn't have a big
+         * enough field for this.
+         */
+        u64 generation;
+        /* sequence number for NFS changes */
+        u64 sequence;
+        /*
+         * transid of the trans_handle that last modified this inode
+         */
+        u64 last_trans;
+        /*
+         * transid that last logged this inode
+         */
+        u64 logged_trans;
+        /*
+         * trans that last made a change that should be fully fsync'd.  This
+         * gets reset to zero each time the inode is logged
+         */
+        u64 log_dirty_trans;
+        /* total number of bytes pending delalloc, used by stat to calc the
+         * real block usage of the file
+         */
+        u64 delalloc_bytes;
+        /*
+         * the size of the file stored in the metadata on disk.  data=ordered
+         * means the in-memory i_size might be larger than the size on disk
+         * because not all the blocks are written yet.
+         */
+        u64 disk_i_size;
+        /* flags field from the on disk inode */
+        u32 flags;
+        /*
+         * if this is a directory then index_cnt is the counter for the index
+         * number for new files that are created
+         */
+        u64 index_cnt;
+        /* the start of block group preferred for allocations. */
+        u64 block_group;
+        /* embedded VFS inode; BTRFS_I() maps it back to this struct */
+        struct inode vfs_inode;
+};
+/*
+ * Map a VFS inode to the btrfs_inode that embeds it as vfs_inode.
+ * Only valid for inodes that really live inside a struct btrfs_inode.
+ */
+static inline struct btrfs_inode *BTRFS_I(struct inode *inode)
+{
+        return container_of(inode, struct btrfs_inode, vfs_inode);
+}
+/*
+ * Set both the in-memory i_size of @inode and the btrfs disk_i_size
+ * field to @size.
+ */
+static inline void btrfs_i_size_write(struct inode *inode, u64 size)
+{
+        struct btrfs_inode *bi = BTRFS_I(inode);
+
+        inode->i_size = size;
+        bi->disk_i_size = size;
+}
+#endif
diff --git a/fs/btrfs/compat.h b/fs/btrfs/compat.h
new file mode 100644
index 000000000000..7c4503ef6efd
--- /dev/null
+++ b/fs/btrfs/compat.h
@@ -0,0 +1,7 @@
+#ifndef _COMPAT_H_
+#define _COMPAT_H_
+/*
+ * link count helpers: these expand directly to drop_nlink() and
+ * inc_nlink() on the inode that is passed in
+ */
+#define btrfs_drop_nlink(inode) drop_nlink(inode)
+#define btrfs_inc_nlink(inode)  inc_nlink(inode)
+#endif /* _COMPAT_H_ */
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
new file mode 100644
index 000000000000..ee848d8585d9
--- /dev/null
+++ b/fs/btrfs/compression.c
@@ -0,0 +1,709 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#include <linux/kernel.h>
+#include <linux/bio.h>
+#include <linux/buffer_head.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/backing-dev.h>
+#include <linux/mpage.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/bit_spinlock.h>
+#include <linux/version.h>
+#include <linux/pagevec.h>
+#include "compat.h"
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "volumes.h"
+#include "ordered-data.h"
+#include "compression.h"
+#include "extent_io.h"
+#include "extent_map.h"
+/*
+ * Bookkeeping shared by all of the bios submitted for one compressed
+ * extent.  It is allocated by the submit functions in this file and
+ * freed by the end_io handler that drops pending_bios to zero.
+ */
+struct compressed_bio {
+        /* number of bios pending for this compressed extent */
+        atomic_t pending_bios;
+        /* the pages with the compressed data on them */
+        struct page **compressed_pages;
+        /* inode that owns this data */
+        struct inode *inode;
+        /* starting offset in the inode for our pages */
+        u64 start;
+        /* number of bytes in the inode we're working on */
+        unsigned long len;
+        /* number of bytes on disk */
+        unsigned long compressed_len;
+        /* number of compressed pages in the array */
+        unsigned long nr_pages;
+        /*
+         * IO errors: set to 1 by the end_io handlers when a bio completes
+         * with an error, and on reads also when checksum verification or
+         * decompression fails
+         */
+        int errors;
+        /*
+         * mirror number handed to btrfs_submit_compressed_read (writes
+         * store 0); it is printed in the checksum failure message
+         */
+        int mirror_num;
+        /* for reads, this is the bio we are copying the data into */
+        struct bio *orig_bio;
+        /*
+         * the start of a variable length array of checksums only
+         * used by reads.  The extra room is reserved by
+         * compressed_bio_size(), so this must remain the last member.
+         */
+        u32 sums;
+};
+/*
+ * Number of bytes to allocate for a compressed_bio that covers disk_size
+ * bytes on disk: the struct itself plus one checksum (of the size recorded
+ * in the super block) for every sector of disk_size, rounded up.
+ */
+static inline int compressed_bio_size(struct btrfs_root *root,
+                                      unsigned long disk_size)
+{
+        u16 csum_bytes = btrfs_super_csum_size(&root->fs_info->super_copy);
+        unsigned long nr_sectors;
+        nr_sectors = (disk_size + root->sectorsize - 1) / root->sectorsize;
+        return sizeof(struct compressed_bio) + nr_sectors * csum_bytes;
+}
+/*
+ * Allocate an empty bio aimed at byte offset first_byte of bdev.
+ *
+ * The bio is sized from bio_get_nr_vecs().  If that allocation fails while
+ * the task has PF_MEMALLOC set, progressively smaller bios are tried.
+ * Returns NULL when no bio could be allocated.
+ */
+static struct bio *compressed_bio_alloc(struct block_device *bdev,
+                                        u64 first_byte, gfp_t gfp_flags)
+{
+        int vecs = bio_get_nr_vecs(bdev);
+        struct bio *new_bio = bio_alloc(gfp_flags, vecs);
+        if (!new_bio && (current->flags & PF_MEMALLOC)) {
+                /* keep halving the vec count until something fits */
+                for (vecs /= 2; vecs; vecs /= 2) {
+                        new_bio = bio_alloc(gfp_flags, vecs);
+                        if (new_bio)
+                                break;
+                }
+        }
+        if (!new_bio)
+                return NULL;
+        new_bio->bi_size = 0;
+        new_bio->bi_bdev = bdev;
+        /* bi_sector counts 512 byte units */
+        new_bio->bi_sector = first_byte >> 9;
+        return new_bio;
+}
+/*
+ * Verify the compressed pages of cb against the checksums stored at
+ * cb->sums, one u32 checksum per page.
+ *
+ * Returns 0 when every page matches or when the inode has NODATASUM set,
+ * and -EIO (after logging the mismatch) on the first page that differs.
+ * disk_start is only used in the log message.
+ */
+static int check_compressed_csum(struct inode *inode,
+                                 struct compressed_bio *cb,
+                                 u64 disk_start)
+{
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        u32 *expected = &cb->sums;
+        unsigned long nr;
+        if (btrfs_test_flag(inode, NODATASUM))
+                return 0;
+        for (nr = 0; nr < cb->nr_pages; nr++, expected++) {
+                struct page *page = cb->compressed_pages[nr];
+                u32 crc = ~(u32)0;
+                char *kaddr = kmap_atomic(page, KM_USER0);
+                crc = btrfs_csum_data(root, kaddr, crc, PAGE_CACHE_SIZE);
+                btrfs_csum_final(crc, (char *)&crc);
+                kunmap_atomic(kaddr, KM_USER0);
+                if (crc == *expected)
+                        continue;
+                printk(KERN_INFO "btrfs csum failed ino %lu "
+                       "extent %llu csum %u "
+                       "wanted %u mirror %d\n", inode->i_ino,
+                       (unsigned long long)disk_start,
+                       crc, *expected, cb->mirror_num);
+                return -EIO;
+        }
+        return 0;
+}
+/*
+ * end_io handler for the bios that read the compressed pages of one
+ * extent.
+ *
+ * Every completion records its error and drops pending_bios; only the
+ * last one goes further.  It verifies the checksums, decompresses into
+ * the pages of the original bio, releases the compressed pages and then
+ * completes the original bio, so the normal end_io handling runs on the
+ * decompressed pages (in the inode address space).
+ *
+ * The compressed pages are freed here, and it must be run
+ * in process context
+ */
+static void end_compressed_bio_read(struct bio *bio, int err)
+{
+        struct compressed_bio *cb = bio->bi_private;
+        unsigned long nr;
+        int ret;
+        if (err)
+                cb->errors = 1;
+        /* other bios for this extent are still in flight, nothing to do */
+        if (!atomic_dec_and_test(&cb->pending_bios)) {
+                bio_put(bio);
+                return;
+        }
+        /* we're the last bio: checksum first, decompress only if it passes */
+        ret = check_compressed_csum(cb->inode, cb, (u64)bio->bi_sector << 9);
+        if (!ret)
+                ret = btrfs_zlib_decompress_biovec(cb->compressed_pages,
+                                                   cb->start,
+                                                   cb->orig_bio->bi_io_vec,
+                                                   cb->orig_bio->bi_vcnt,
+                                                   cb->compressed_len);
+        if (ret)
+                cb->errors = 1;
+        /* the compressed copies are no longer needed */
+        for (nr = 0; nr < cb->nr_pages; nr++) {
+                struct page *page = cb->compressed_pages[nr];
+                page->mapping = NULL;
+                page_cache_release(page);
+        }
+        /* complete the original bio */
+        if (cb->errors) {
+                bio_io_error(cb->orig_bio);
+        } else {
+                struct bio_vec *bvec = cb->orig_bio->bi_io_vec;
+                int i;
+                /*
+                 * the checksum was verified above, flag each page as
+                 * checked so the end_io handlers know about it
+                 */
+                for (i = 0; i < cb->orig_bio->bi_vcnt; i++)
+                        SetPageChecked(bvec[i].bv_page);
+                bio_endio(cb->orig_bio, 0);
+        }
+        kfree(cb->compressed_pages);
+        kfree(cb);
+        bio_put(bio);
+}
+/*
+ * End writeback on every file page that is present in the page cache
+ * for the range [start, start + ram_size) of a compressed write.
+ * Indexes with no page in the cache are stepped over.  Always returns 0.
+ */
+static noinline int end_compressed_writeback(struct inode *inode, u64 start,
+                                             unsigned long ram_size)
+{
+        struct page *batch[16];
+        unsigned long cur = start >> PAGE_CACHE_SHIFT;
+        unsigned long last = (start + ram_size - 1) >> PAGE_CACHE_SHIFT;
+        unsigned long remaining = last - cur + 1;
+        while (remaining > 0) {
+                int found;
+                int i;
+                found = find_get_pages_contig(inode->i_mapping, cur,
+                                              min_t(unsigned long, remaining,
+                                                    ARRAY_SIZE(batch)),
+                                              batch);
+                for (i = 0; i < found; i++) {
+                        end_page_writeback(batch[i]);
+                        /* drop the reference the lookup took */
+                        page_cache_release(batch[i]);
+                }
+                /* nothing cached at this index: move past it */
+                if (found == 0)
+                        found = 1;
+                remaining -= found;
+                cur += found;
+        }
+        /* the inode may be gone now */
+        return 0;
+}
+/*
+ * end_io handler for the bios that write the compressed pages of one
+ * extent.
+ *
+ * Every completion records its error and drops pending_bios; the last
+ * one calls the writeback end hook for the file range, clears writeback
+ * on the file pages, and frees the compressed pages and the
+ * compressed_bio.
+ */
+static void end_compressed_bio_write(struct bio *bio, int err)
+{
+        struct compressed_bio *cb = bio->bi_private;
+        struct extent_io_tree *io_tree;
+        struct inode *inode;
+        struct page *first;
+        unsigned long nr;
+        if (err)
+                cb->errors = 1;
+        /* other bios for this extent are still in flight, nothing to do */
+        if (!atomic_dec_and_test(&cb->pending_bios)) {
+                bio_put(bio);
+                return;
+        }
+        /*
+         * we're the last bio for this extent.  Step one is to call back
+         * into the FS for the end_io operations; the first compressed
+         * page is pointed at the inode mapping only for the duration of
+         * the hook call.
+         */
+        inode = cb->inode;
+        io_tree = &BTRFS_I(inode)->io_tree;
+        first = cb->compressed_pages[0];
+        first->mapping = inode->i_mapping;
+        io_tree->ops->writepage_end_io_hook(first, cb->start,
+                                            cb->start + cb->len - 1,
+                                            NULL, 1);
+        first->mapping = NULL;
+        end_compressed_writeback(inode, cb->start, cb->len);
+        /* note, our inode could be gone now */
+        /*
+         * the compressed pages came from alloc_page and are not attached
+         * to the inode at all, drop them
+         */
+        for (nr = 0; nr < cb->nr_pages; nr++) {
+                struct page *page = cb->compressed_pages[nr];
+                page->mapping = NULL;
+                page_cache_release(page);
+        }
+        kfree(cb->compressed_pages);
+        kfree(cb);
+        bio_put(bio);
+}
+/*
+ * worker function to build and submit bios for previously compressed pages.
+ * The corresponding pages in the inode should be marked for writeback
+ * and the compressed pages should have a reference on them for dropping
+ * when the IO is complete.
+ *
+ * This also checksums the file bytes and gets things ready for
+ * the end io hooks.
+ *
+ * @inode:            inode that owns the data
+ * @start:            file offset of the extent, expected to be page aligned
+ * @len:              number of file bytes covered by the extent
+ * @disk_start:       byte offset on disk of the first compressed page
+ * @compressed_len:   number of compressed bytes
+ * @compressed_pages: array of nr_pages pages holding the compressed data
+ * @nr_pages:         number of entries in compressed_pages
+ *
+ * Returns 0 once every bio has been submitted.  Returns -ENOMEM when the
+ * compressed_bio or the first bio cannot be allocated; nothing has been
+ * submitted in that case and compressed_pages is left untouched for the
+ * caller.  Failures after that point still trip BUG_ON.
+ */
+int btrfs_submit_compressed_write(struct inode *inode, u64 start,
+                                 unsigned long len, u64 disk_start,
+                                 unsigned long compressed_len,
+                                 struct page **compressed_pages,
+                                 unsigned long nr_pages)
+{
+        struct bio *bio = NULL;
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        struct compressed_bio *cb;
+        unsigned long bytes_left;
+        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+        int page_index = 0;
+        struct page *page;
+        u64 first_byte = disk_start;
+        struct block_device *bdev;
+        int ret;
+        WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1));
+        cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
+        if (!cb)
+                return -ENOMEM;
+        atomic_set(&cb->pending_bios, 0);
+        cb->errors = 0;
+        cb->inode = inode;
+        cb->start = start;
+        cb->len = len;
+        cb->mirror_num = 0;
+        cb->compressed_pages = compressed_pages;
+        cb->compressed_len = compressed_len;
+        cb->orig_bio = NULL;
+        cb->nr_pages = nr_pages;
+        bdev = root->fs_info->fs_devices->latest_bdev;
+        bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
+        if (!bio) {
+                /* no IO has been started yet, so we can back out cleanly */
+                kfree(cb);
+                return -ENOMEM;
+        }
+        bio->bi_private = cb;
+        bio->bi_end_io = end_compressed_bio_write;
+        atomic_inc(&cb->pending_bios);
+        /* create and submit bios for the compressed pages */
+        bytes_left = compressed_len;
+        for (page_index = 0; page_index < cb->nr_pages; page_index++) {
+                page = compressed_pages[page_index];
+                /*
+                 * the merge hook looks at page->mapping, so point the page
+                 * at the inode just for the duration of the call
+                 */
+                page->mapping = inode->i_mapping;
+                if (bio->bi_size)
+                        ret = io_tree->ops->merge_bio_hook(page, 0,
+                                                           PAGE_CACHE_SIZE,
+                                                           bio, 0);
+                else
+                        ret = 0;
+                page->mapping = NULL;
+                if (ret || bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) <
+                    PAGE_CACHE_SIZE) {
+                        bio_get(bio);
+                        /*
+                         * inc the count before we submit the bio so
+                         * we know the end IO handler won't happen before
+                         * we inc the count.  Otherwise, the cb might get
+                         * freed before we're done setting it up
+                         */
+                        atomic_inc(&cb->pending_bios);
+                        ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+                        BUG_ON(ret);
+                        ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
+                        BUG_ON(ret);
+                        ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
+                        BUG_ON(ret);
+                        bio_put(bio);
+                        bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
+                        /*
+                         * earlier bios are already in flight and
+                         * pending_bios already counts this one, so there
+                         * is no clean way to back out here
+                         */
+                        BUG_ON(!bio);
+                        bio->bi_private = cb;
+                        bio->bi_end_io = end_compressed_bio_write;
+                        bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
+                }
+                if (bytes_left < PAGE_CACHE_SIZE) {
+                        printk("bytes left %lu compress len %lu nr %lu\n",
+                               bytes_left, cb->compressed_len, cb->nr_pages);
+                }
+                bytes_left -= PAGE_CACHE_SIZE;
+                first_byte += PAGE_CACHE_SIZE;
+                cond_resched();
+        }
+        /* submit whatever is left in the last bio */
+        bio_get(bio);
+        ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+        BUG_ON(ret);
+        ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
+        BUG_ON(ret);
+        ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
+        BUG_ON(ret);
+        bio_put(bio);
+        return 0;
+}
+/*
+ * Try to extend the original read bio (cb->orig_bio) with additional
+ * file pages that lie after its last page and before compressed_end,
+ * so that one pass over the compressed extent can fill them as well.
+ *
+ * New pages are allocated, inserted into the page cache and queued for
+ * the LRU here.  The walk stops at the first page that is past i_size,
+ * cannot be allocated, does not map to the same extent as the original
+ * bio, or does not fit into the bio.  Always returns 0.
+ */
+static noinline int add_ra_bio_pages(struct inode *inode,
+                                     u64 compressed_end,
+                                     struct compressed_bio *cb)
+{
+        unsigned long end_index;
+        unsigned long page_index;
+        u64 last_offset;
+        u64 isize = i_size_read(inode);
+        int ret;
+        struct page *page;
+        unsigned long nr_pages = 0;
+        struct extent_map *em;
+        struct address_space *mapping = inode->i_mapping;
+        struct pagevec pvec;
+        struct extent_map_tree *em_tree;
+        struct extent_io_tree *tree;
+        u64 end;
+        int misses = 0;
+        /* start right after the last page already in the original bio */
+        page = cb->orig_bio->bi_io_vec[cb->orig_bio->bi_vcnt - 1].bv_page;
+        last_offset = (page_offset(page) + PAGE_CACHE_SIZE);
+        em_tree = &BTRFS_I(inode)->extent_tree;
+        tree = &BTRFS_I(inode)->io_tree;
+        if (isize == 0)
+                return 0;
+        /* index of the page that holds the last byte of the file */
+        end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
+        pagevec_init(&pvec, 0);
+        while (last_offset < compressed_end) {
+                page_index = last_offset >> PAGE_CACHE_SHIFT;
+                if (page_index > end_index)
+                        break;
+                rcu_read_lock();
+                page = radix_tree_lookup(&mapping->page_tree, page_index);
+                rcu_read_unlock();
+                /*
+                 * despite the name, 'misses' counts indexes that already
+                 * have a page in the cache; those are skipped, and the
+                 * walk gives up once more than four have been seen
+                 */
+                if (page) {
+                        misses++;
+                        if (misses > 4)
+                                break;
+                        goto next;
+                }
+                page = alloc_page(mapping_gfp_mask(mapping) | GFP_NOFS);
+                if (!page)
+                        break;
+                page->index = page_index;
+                /*
+                 * what we want to do here is call add_to_page_cache_lru,
+                 * but that isn't exported, so we reproduce it here
+                 */
+                if (add_to_page_cache(page, mapping,
+                                      page->index, GFP_NOFS)) {
+                        page_cache_release(page);
+                        goto next;
+                }
+                /* open coding of lru_cache_add, also not exported */
+                page_cache_get(page);
+                if (!pagevec_add(&pvec, page))
+                        __pagevec_lru_add_file(&pvec);
+                end = last_offset + PAGE_CACHE_SIZE - 1;
+                /*
+                 * at this point, we have a locked page in the page cache
+                 * for these bytes in the file.  But, we have to make
+                 * sure they map to this compressed extent on disk.
+                 */
+                set_page_extent_mapped(page);
+                lock_extent(tree, last_offset, end, GFP_NOFS);
+                spin_lock(&em_tree->lock);
+                em = lookup_extent_mapping(em_tree, last_offset,
+                                           PAGE_CACHE_SIZE);
+                spin_unlock(&em_tree->lock);
+                /*
+                 * stop unless the whole page sits inside an extent whose
+                 * disk start is the sector the original bio points at
+                 */
+                if (!em || last_offset < em->start ||
+                    (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) ||
+                    (em->block_start >> 9) != cb->orig_bio->bi_sector) {
+                        free_extent_map(em);
+                        unlock_extent(tree, last_offset, end, GFP_NOFS);
+                        unlock_page(page);
+                        page_cache_release(page);
+                        break;
+                }
+                free_extent_map(em);
+                /* zero the part of the last file page that is past i_size */
+                if (page->index == end_index) {
+                        char *userpage;
+                        size_t zero_offset = isize & (PAGE_CACHE_SIZE - 1);
+                        if (zero_offset) {
+                                int zeros;
+                                zeros = PAGE_CACHE_SIZE - zero_offset;
+                                userpage = kmap_atomic(page, KM_USER0);
+                                memset(userpage + zero_offset, 0, zeros);
+                                flush_dcache_page(page);
+                                kunmap_atomic(userpage, KM_USER0);
+                        }
+                }
+                ret = bio_add_page(cb->orig_bio, page,
+                                   PAGE_CACHE_SIZE, 0);
+                /*
+                 * on success the page and the extent range stay locked and
+                 * only one reference is dropped; NOTE(review): presumably
+                 * the read completion path unlocks them - confirm
+                 */
+                if (ret == PAGE_CACHE_SIZE) {
+                        nr_pages++;
+                        page_cache_release(page);
+                } else {
+                        unlock_extent(tree, last_offset, end, GFP_NOFS);
+                        unlock_page(page);
+                        page_cache_release(page);
+                        break;
+                }
+next:
+                last_offset += PAGE_CACHE_SIZE;
+        }
+        /* flush any pages still queued for the LRU */
+        if (pagevec_count(&pvec))
+                __pagevec_lru_add_file(&pvec);
+        return 0;
+}
+/*
+ * for a compressed read, the bio we get passed has all the inode pages
+ * in it.  We don't actually do IO on those pages but allocate new ones
+ * to hold the compressed pages on disk.
+ *
+ * bio->bi_sector points to the compressed extent on disk
+ * bio->bi_io_vec points to all of the inode pages
+ * bio->bi_vcnt is a count of pages
+ *
+ * After the compressed pages are read, we copy the bytes into the
+ * bio we were passed and then call the bio end_io calls
+ *
+ * @mirror_num is passed on to btrfs_map_bio; @bio_flags is not used here.
+ *
+ * Returns 0 once every bio has been submitted.  Returns -EIO when no
+ * extent mapping covers the first page of the bio, and -ENOMEM when the
+ * compressed_bio, the page array, a compressed page or the first bio
+ * cannot be allocated.  On those error returns nothing has been
+ * submitted, everything allocated here has been freed again and the bio
+ * that was passed in has not been modified or completed.  Failures
+ * after the first submission still trip BUG_ON.
+ */
+int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
+                                 int mirror_num, unsigned long bio_flags)
+{
+        struct extent_io_tree *tree;
+        struct extent_map_tree *em_tree;
+        struct compressed_bio *cb;
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        unsigned long uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
+        unsigned long compressed_len;
+        unsigned long nr_pages;
+        unsigned long page_index;
+        struct page *page;
+        struct block_device *bdev;
+        struct bio *comp_bio;
+        u64 cur_disk_byte = (u64)bio->bi_sector << 9;
+        u64 em_len;
+        u64 em_start;
+        struct extent_map *em;
+        int ret;
+        u32 *sums;
+        tree = &BTRFS_I(inode)->io_tree;
+        em_tree = &BTRFS_I(inode)->extent_tree;
+        /* we need the actual starting offset of this extent in the file */
+        spin_lock(&em_tree->lock);
+        em = lookup_extent_mapping(em_tree,
+                                   page_offset(bio->bi_io_vec->bv_page),
+                                   PAGE_CACHE_SIZE);
+        spin_unlock(&em_tree->lock);
+        if (!em)
+                return -EIO;
+        compressed_len = em->block_len;
+        cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
+        if (!cb) {
+                free_extent_map(em);
+                return -ENOMEM;
+        }
+        atomic_set(&cb->pending_bios, 0);
+        cb->errors = 0;
+        cb->inode = inode;
+        cb->mirror_num = mirror_num;
+        sums = &cb->sums;
+        cb->start = em->orig_start;
+        em_len = em->len;
+        em_start = em->start;
+        free_extent_map(em);
+        em = NULL;
+        cb->len = uncompressed_len;
+        cb->compressed_len = compressed_len;
+        cb->orig_bio = bio;
+        nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) /
+                                 PAGE_CACHE_SIZE;
+        cb->compressed_pages = kmalloc(sizeof(struct page *) * nr_pages,
+                                       GFP_NOFS);
+        if (!cb->compressed_pages)
+                goto fail_cb;
+        bdev = root->fs_info->fs_devices->latest_bdev;
+        for (page_index = 0; page_index < nr_pages; page_index++) {
+                cb->compressed_pages[page_index] = alloc_page(GFP_NOFS |
+                                                              __GFP_HIGHMEM);
+                /* page_index now counts the pages that need freeing */
+                if (!cb->compressed_pages[page_index])
+                        goto fail_pages;
+        }
+        cb->nr_pages = nr_pages;
+        /*
+         * grab the first bio before add_ra_bio_pages() so that every
+         * allocation we can still back out of happens while the original
+         * bio is exactly as the caller built it
+         */
+        comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS);
+        if (!comp_bio)
+                goto fail_pages;
+        add_ra_bio_pages(inode, em_start + em_len, cb);
+        /* include any pages we added in add_ra_bio_pages */
+        uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
+        cb->len = uncompressed_len;
+        comp_bio->bi_private = cb;
+        comp_bio->bi_end_io = end_compressed_bio_read;
+        atomic_inc(&cb->pending_bios);
+        for (page_index = 0; page_index < nr_pages; page_index++) {
+                page = cb->compressed_pages[page_index];
+                /*
+                 * the merge hook looks at the page's mapping and index, so
+                 * set them up just for the duration of the call
+                 */
+                page->mapping = inode->i_mapping;
+                page->index = em_start >> PAGE_CACHE_SHIFT;
+                if (comp_bio->bi_size)
+                        ret = tree->ops->merge_bio_hook(page, 0,
+                                                        PAGE_CACHE_SIZE,
+                                                        comp_bio, 0);
+                else
+                        ret = 0;
+                page->mapping = NULL;
+                if (ret || bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0) <
+                    PAGE_CACHE_SIZE) {
+                        bio_get(comp_bio);
+                        ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
+                        BUG_ON(ret);
+                        /*
+                         * inc the count before we submit the bio so
+                         * we know the end IO handler won't happen before
+                         * we inc the count.  Otherwise, the cb might get
+                         * freed before we're done setting it up
+                         */
+                        atomic_inc(&cb->pending_bios);
+                        if (!btrfs_test_flag(inode, NODATASUM)) {
+                                btrfs_lookup_bio_sums(root, inode, comp_bio,
+                                                      sums);
+                        }
+                        /* step past the checksum slots of this bio */
+                        sums += (comp_bio->bi_size + root->sectorsize - 1) /
+                                root->sectorsize;
+                        ret = btrfs_map_bio(root, READ, comp_bio,
+                                            mirror_num, 0);
+                        BUG_ON(ret);
+                        bio_put(comp_bio);
+                        comp_bio = compressed_bio_alloc(bdev, cur_disk_byte,
+                                                        GFP_NOFS);
+                        /*
+                         * earlier bios are already in flight and
+                         * pending_bios already counts this one, so there
+                         * is no clean way to back out here
+                         */
+                        BUG_ON(!comp_bio);
+                        comp_bio->bi_private = cb;
+                        comp_bio->bi_end_io = end_compressed_bio_read;
+                        bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0);
+                }
+                cur_disk_byte += PAGE_CACHE_SIZE;
+        }
+        /* submit whatever is left in the last bio */
+        bio_get(comp_bio);
+        ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
+        BUG_ON(ret);
+        if (!btrfs_test_flag(inode, NODATASUM))
+                btrfs_lookup_bio_sums(root, inode, comp_bio, sums);
+        ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
+        BUG_ON(ret);
+        bio_put(comp_bio);
+        return 0;
+fail_pages:
+        /* free the compressed pages allocated so far: [0, page_index) */
+        while (page_index > 0) {
+                page_index--;
+                page_cache_release(cb->compressed_pages[page_index]);
+        }
+        kfree(cb->compressed_pages);
+fail_cb:
+        kfree(cb);
+        return -ENOMEM;
+}
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
new file mode 100644
index 000000000000..421f5b4aa715
--- /dev/null
+++ b/fs/btrfs/compression.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#ifndef __BTRFS_COMPRESSION_
+#define __BTRFS_COMPRESSION_
+/*
+ * zlib helpers.  Their definitions are not part of compression.c;
+ * NOTE(review): presumably they live in zlib.c - confirm.
+ */
+int btrfs_zlib_decompress(unsigned char *data_in,
+                          struct page *dest_page,
+                          unsigned long start_byte,
+                          size_t srclen, size_t destlen);
+int btrfs_zlib_compress_pages(struct address_space *mapping,
+                              u64 start, unsigned long len,
+                              struct page **pages,
+                              unsigned long nr_dest_pages,
+                              unsigned long *out_pages,
+                              unsigned long *total_in,
+                              unsigned long *total_out,
+                              unsigned long max_out);
+/*
+ * compression.c calls this from its read end_io handler with the
+ * compressed pages, the file offset of the extent, the bio_vec array and
+ * count of the original bio, and the compressed length
+ */
+int btrfs_zlib_decompress_biovec(struct page **pages_in,
+                              u64 disk_start,
+                              struct bio_vec *bvec,
+                              int vcnt,
+                              size_t srclen);
+void btrfs_zlib_exit(void);
+/*
+ * compression.c: build and submit the bios that write the already
+ * compressed pages of one extent
+ */
+int btrfs_submit_compressed_write(struct inode *inode, u64 start,
+                                  unsigned long len, u64 disk_start,
+                                  unsigned long compressed_len,
+                                  struct page **compressed_pages,
+                                  unsigned long nr_pages);
+/*
+ * compression.c: read the compressed extent that bio points at and
+ * decompress it into the inode pages carried by bio
+ */
+int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
+                                 int mirror_num, unsigned long bio_flags);
+#endif
diff --git a/fs/btrfs/crc32c.h b/fs/btrfs/crc32c.h
new file mode 100644
index 000000000000..6e1b3de36700
--- /dev/null
+++ b/fs/btrfs/crc32c.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#ifndef __BTRFS_CRC32C__
+#define __BTRFS_CRC32C__
+#include <linux/crc32c.h>
+/*
+ * this file used to do more for selecting the HW version of crc32c,
+ * perhaps it will one day again soon.
+ *
+ * For now btrfs_crc32c() expands directly to crc32c() from
+ * <linux/crc32c.h> with the same three arguments.
+ */
+#define btrfs_crc32c(seed, data, length) crc32c(seed, data, length)
+#endif
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
new file mode 100644
index 000000000000..9e46c0776816
--- /dev/null
+++ b/fs/btrfs/ctree.c
@@ -0,0 +1,3953 @@
+/*
+ * Copyright (C) 2007,2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#include <linux/sched.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "print-tree.h"
+#include "locking.h"
+static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
+                      *root, struct btrfs_path *path, int level);
+static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
+                      *root, struct btrfs_key *ins_key,
+                      struct btrfs_path *path, int data_size, int extend);
+static int push_node_left(struct btrfs_trans_handle *trans,
+                          struct btrfs_root *root, struct extent_buffer *dst,
+                          struct extent_buffer *src, int empty);
+static int balance_node_right(struct btrfs_trans_handle *trans,
+                              struct btrfs_root *root,
+                              struct extent_buffer *dst_buf,
+                              struct extent_buffer *src_buf);
+static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+                   struct btrfs_path *path, int level, int slot);
+/*
+ * put a path into its pristine state by clearing every byte of the
+ * structure
+ */
+inline void btrfs_init_path(struct btrfs_path *p)
+{
+        memset(p, 0, sizeof(struct btrfs_path));
+}
+/*
+ * allocate a path from btrfs_path_cachep with GFP_NOFS.  A successfully
+ * allocated path is zeroed via btrfs_init_path() and has reada set to 1.
+ * NULL is returned when the allocation fails.
+ */
+struct btrfs_path *btrfs_alloc_path(void)
+{
+        struct btrfs_path *fresh;
+
+        fresh = kmem_cache_alloc(btrfs_path_cachep, GFP_NOFS);
+        if (!fresh)
+                return NULL;
+
+        btrfs_init_path(fresh);
+        fresh->reada = 1;
+        return fresh;
+}
+/*
+ * this also releases the path: btrfs_release_path() is run on it first,
+ * and only then is the structure handed back to btrfs_path_cachep.
+ */
+void btrfs_free_path(struct btrfs_path *p)
+{
+        btrfs_release_path(NULL, p); /* release never looks at its root argument */
+        kmem_cache_free(btrfs_path_cachep, p);
+}
+/*
+ * releasing a path walks every level, resets the slot, unlocks any buffer
+ * the path had locked and drops the path's reference on it.
+ *
+ * It is safe to call this on paths that hold no locks or extent buffers;
+ * such levels only get their slot reset.
+ */
+noinline void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
+{
+        int level;
+
+        for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
+                struct extent_buffer *eb = p->nodes[level];
+
+                p->slots[level] = 0;
+                if (eb) {
+                        if (p->locks[level]) {
+                                btrfs_tree_unlock(eb);
+                                p->locks[level] = 0;
+                        }
+                        free_extent_buffer(eb);
+                        p->nodes[level] = NULL;
+                }
+        }
+}
+/*
+ * take a reference on whatever buffer is the root node of the tree right
+ * now.  Only root->node_lock is held while the pointer is sampled and the
+ * reference is taken; no tree lock is acquired, so a concurrent writer may
+ * install a different root node at any time.  See btrfs_lock_root_node
+ * for the retry loop that deals with that.
+ *
+ * The returned extent buffer carries the reference taken here, so it
+ * won't disappear under the caller.
+ */
+struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
+{
+        struct extent_buffer *cur_root;
+
+        spin_lock(&root->node_lock);
+        cur_root = root->node;
+        extent_buffer_get(cur_root);
+        spin_unlock(&root->node_lock);
+
+        return cur_root;
+}
+/*
+ * keep grabbing a reference on the root node and locking it until the
+ * buffer we locked is still root->node when re-checked under
+ * root->node_lock.  A stale candidate is unlocked and its reference
+ * dropped before retrying.  The buffer returned is locked and has a
+ * reference held.
+ */
+struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
+{
+        struct extent_buffer *candidate;
+        int still_root;
+
+        for (;;) {
+                candidate = btrfs_root_node(root);
+                btrfs_tree_lock(candidate);
+
+                spin_lock(&root->node_lock);
+                still_root = (candidate == root->node);
+                spin_unlock(&root->node_lock);
+
+                if (still_root)
+                        return candidate;
+
+                btrfs_tree_unlock(candidate);
+                free_extent_buffer(candidate);
+        }
+}
+/*
+ * cowonly roots (everything that is not a reference counted cow subvolume)
+ * just get put onto a simple dirty list, which transaction.c walks to make
+ * sure they get properly updated on disk.
+ *
+ * Roots without track_dirty set, and roots already on a list, are left
+ * untouched.
+ */
+static void add_root_to_dirty_list(struct btrfs_root *root)
+{
+        if (!root->track_dirty)
+                return;
+        if (!list_empty(&root->dirty_list))
+                return;
+
+        list_add(&root->dirty_list, &root->fs_info->dirty_cowonly_roots);
+}
+/*
+ * used by snapshot creation to make a copy of a root for a tree with
+ * a given objectid.  The buffer with the new root node is returned in
+ * cow_ret, and this func returns zero on success or a negative error code.
+ *
+ * A temporary kmalloc'd copy of *root, with only root_key.objectid changed
+ * to new_root_objectid, is what gets passed to btrfs_alloc_free_block and
+ * btrfs_inc_ref; it is freed again before this function returns.
+ *
+ * Errors returned: -ENOMEM when the temporary root cannot be allocated,
+ * PTR_ERR() of a failed block allocation, or the btrfs_inc_ref result.
+ * *cow_ret is only written on success.
+ *
+ * NOTE(review): when btrfs_inc_ref fails, the newly allocated cow buffer
+ * is not released in this function -- confirm who cleans it up.
+ */
+int btrfs_copy_root(struct btrfs_trans_handle *trans,
+                      struct btrfs_root *root,
+                      struct extent_buffer *buf,
+                      struct extent_buffer **cow_ret, u64 new_root_objectid)
+{
+        struct extent_buffer *cow;
+        u32 nritems;
+        int ret = 0;
+        int level;
+        struct btrfs_root *new_root;
+        new_root = kmalloc(sizeof(*new_root), GFP_NOFS);
+        if (!new_root)
+                return -ENOMEM;
+        memcpy(new_root, root, sizeof(*new_root)); /* shallow, throwaway copy */
+        new_root->root_key.objectid = new_root_objectid;
+        WARN_ON(root->ref_cows && trans->transid !=
+                root->fs_info->running_transaction->transid);
+        WARN_ON(root->ref_cows && trans->transid != root->last_trans);
+        level = btrfs_header_level(buf);
+        nritems = btrfs_header_nritems(buf); /* read but not used further below */
+        cow = btrfs_alloc_free_block(trans, new_root, buf->len, 0,
+                                     new_root_objectid, trans->transid,
+                                     level, buf->start, 0);
+        if (IS_ERR(cow)) {
+                kfree(new_root);
+                return PTR_ERR(cow);
+        }
+        copy_extent_buffer(cow, buf, 0, 0, cow->len); /* start from an exact copy of buf */
+        btrfs_set_header_bytenr(cow, cow->start); /* then rewrite the header for its new home */
+        btrfs_set_header_generation(cow, trans->transid);
+        btrfs_set_header_owner(cow, new_root_objectid);
+        btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
+        write_extent_buffer(cow, root->fs_info->fsid,
+                            (unsigned long)btrfs_header_fsid(cow),
+                            BTRFS_FSID_SIZE);
+        WARN_ON(btrfs_header_generation(buf) > trans->transid);
+        ret = btrfs_inc_ref(trans, new_root, buf, cow, NULL);
+        kfree(new_root); /* the temporary root is done with on both outcomes */
+        if (ret)
+                return ret;
+        btrfs_mark_buffer_dirty(cow);
+        *cow_ret = cow;
+        return 0;
+}
+/*
+ * does the dirty work in cow of a single block.  The parent block (if
+ * supplied) is updated to point to the new cow copy.  The new buffer is marked
+ * dirty and returned locked.  If you modify the block it needs to be marked
+ * dirty again.
+ *
+ * search_start -- an allocation hint for the new block
+ *
+ * empty_size -- a hint that you plan on doing more cow.  This is the size in
+ * bytes the allocator should try to find free next to the block it returns.
+ * This is just a hint and may be ignored by the allocator.
+ *
+ * prealloc_dest -- if you have already reserved a destination for the cow,
+ * this uses that block instead of allocating a new one.
+ * btrfs_alloc_reserved_extent is used to finish the allocation.
+ *
+ * cow_ret -- receives the new copy on success.  If *cow_ret already equals
+ * buf on entry, buf is additionally unlocked before returning.
+ *
+ * Returns 0 on success, PTR_ERR() of a failed block allocation, or the
+ * error from btrfs_inc_ref/btrfs_update_ref on the two paths that return
+ * it.  The remaining failures are BUG_ON/WARN_ON rather than returned.
+ *
+ * NOTE(review): the early "return ret" paths taken after the copy do not
+ * release the freshly allocated cow buffer in this function -- confirm
+ * where that cleanup happens.
+ */
+static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
+                             struct btrfs_root *root,
+                             struct extent_buffer *buf,
+                             struct extent_buffer *parent, int parent_slot,
+                             struct extent_buffer **cow_ret,
+                             u64 search_start, u64 empty_size,
+                             u64 prealloc_dest)
+{
+        u64 parent_start;
+        struct extent_buffer *cow;
+        u32 nritems;
+        int ret = 0;
+        int level;
+        int unlock_orig = 0;
+        if (*cow_ret == buf) /* must be sampled now: *cow_ret is overwritten at the end */
+                unlock_orig = 1;
+        WARN_ON(!btrfs_tree_locked(buf));
+        if (parent)
+                parent_start = parent->start;
+        else
+                parent_start = 0; /* no parent supplied */
+        WARN_ON(root->ref_cows && trans->transid !=
+                root->fs_info->running_transaction->transid);
+        WARN_ON(root->ref_cows && trans->transid != root->last_trans);
+        level = btrfs_header_level(buf);
+        nritems = btrfs_header_nritems(buf);
+        if (prealloc_dest) {
+                struct btrfs_key ins;
+                ins.objectid = prealloc_dest;
+                ins.offset = buf->len;
+                ins.type = BTRFS_EXTENT_ITEM_KEY;
+                ret = btrfs_alloc_reserved_extent(trans, root, parent_start,
+                                                  root->root_key.objectid,
+                                                  trans->transid, level, &ins);
+                BUG_ON(ret);
+                cow = btrfs_init_new_buffer(trans, root, prealloc_dest,
+                                            buf->len);
+        } else {
+                cow = btrfs_alloc_free_block(trans, root, buf->len,
+                                             parent_start,
+                                             root->root_key.objectid,
+                                             trans->transid, level,
+                                             search_start, empty_size);
+        }
+        if (IS_ERR(cow))
+                return PTR_ERR(cow);
+        copy_extent_buffer(cow, buf, 0, 0, cow->len); /* start from an exact copy of buf */
+        btrfs_set_header_bytenr(cow, cow->start); /* then rewrite the header for its new home */
+        btrfs_set_header_generation(cow, trans->transid);
+        btrfs_set_header_owner(cow, root->root_key.objectid);
+        btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
+        write_extent_buffer(cow, root->fs_info->fsid,
+                            (unsigned long)btrfs_header_fsid(cow),
+                            BTRFS_FSID_SIZE);
+        WARN_ON(btrfs_header_generation(buf) > trans->transid);
+        if (btrfs_header_generation(buf) != trans->transid) { /* buf predates this transaction */
+                u32 nr_extents;
+                ret = btrfs_inc_ref(trans, root, buf, cow, &nr_extents);
+                if (ret)
+                        return ret;
+                ret = btrfs_cache_ref(trans, root, buf, nr_extents);
+                WARN_ON(ret);
+        } else if (btrfs_header_owner(buf) == BTRFS_TREE_RELOC_OBJECTID) {
+                /*
+                 * There are only two places that can drop reference to
+                 * tree blocks owned by living reloc trees, one is here,
+                 * the other place is btrfs_drop_subtree. In both places,
+                 * we check reference count while tree block is locked.
+                 * Furthermore, if reference count is one, it won't get
+                 * increased by someone else.
+                 */
+                u32 refs;
+                ret = btrfs_lookup_extent_ref(trans, root, buf->start,
+                                              buf->len, &refs);
+                BUG_ON(ret);
+                if (refs == 1) {
+                        ret = btrfs_update_ref(trans, root, buf, cow,
+                                               0, nritems);
+                        clean_tree_block(trans, root, buf);
+                } else {
+                        ret = btrfs_inc_ref(trans, root, buf, cow, NULL);
+                }
+                BUG_ON(ret);
+        } else { /* buf carries this transaction's generation and is not reloc-owned */
+                ret = btrfs_update_ref(trans, root, buf, cow, 0, nritems);
+                if (ret)
+                        return ret;
+                clean_tree_block(trans, root, buf);
+        }
+        if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
+                ret = btrfs_reloc_tree_cache_ref(trans, root, cow, buf->start);
+                WARN_ON(ret);
+        }
+        if (buf == root->node) { /* cow of the tree root: swap root->node itself */
+                WARN_ON(parent && parent != buf);
+                spin_lock(&root->node_lock);
+                root->node = cow;
+                extent_buffer_get(cow); /* extra reference for the root->node pointer */
+                spin_unlock(&root->node_lock);
+                if (buf != root->commit_root) { /* the commit root's extent is not freed here */
+                        btrfs_free_extent(trans, root, buf->start,
+                                          buf->len, buf->start,
+                                          root->root_key.objectid,
+                                          btrfs_header_generation(buf),
+                                          level, 1);
+                }
+                free_extent_buffer(buf); /* matches the reference root->node used to hold */
+                add_root_to_dirty_list(root);
+        } else { /* ordinary block: repoint the parent's slot at the copy */
+                btrfs_set_node_blockptr(parent, parent_slot,
+                                        cow->start);
+                WARN_ON(trans->transid == 0);
+                btrfs_set_node_ptr_generation(parent, parent_slot,
+                                              trans->transid);
+                btrfs_mark_buffer_dirty(parent);
+                WARN_ON(btrfs_header_generation(parent) != trans->transid);
+                btrfs_free_extent(trans, root, buf->start, buf->len,
+                                  parent_start, btrfs_header_owner(parent),
+                                  btrfs_header_generation(parent), level, 1);
+        }
+        if (unlock_orig)
+                btrfs_tree_unlock(buf);
+        free_extent_buffer(buf); /* on every success path one reference on buf is dropped */
+        btrfs_mark_buffer_dirty(cow);
+        *cow_ret = cow;
+        return 0;
+}
+/*
+ * front end for __btrfs_cow_block, which does the real work of cowing a
+ * single block.  The extra check here avoids cowing a block again when it
+ * already carries this transaction's generation, is owned by this root and
+ * has not been flagged as written; in that case buf itself is handed back
+ * through cow_ret and zero is returned.
+ *
+ * Otherwise the result of __btrfs_cow_block is returned, with the
+ * allocation hint set to buf->start rounded down to a 1GB boundary and no
+ * empty_size hint.
+ */
+noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
+                    struct btrfs_root *root, struct extent_buffer *buf,
+                    struct extent_buffer *parent, int parent_slot,
+                    struct extent_buffer **cow_ret, u64 prealloc_dest)
+{
+        u64 hint;
+        int already_cowed;
+
+        /* complain loudly when the handle does not match the running state */
+        if (trans->transaction != root->fs_info->running_transaction) {
+                printk(KERN_CRIT "trans %llu running %llu\n",
+                       (unsigned long long)trans->transid,
+                       (unsigned long long)
+                       root->fs_info->running_transaction->transid);
+                WARN_ON(1);
+        }
+        if (trans->transid != root->fs_info->generation) {
+                printk(KERN_CRIT "trans %llu running %llu\n",
+                       (unsigned long long)trans->transid,
+                       (unsigned long long)root->fs_info->generation);
+                WARN_ON(1);
+        }
+
+        /* the header is inspected under hash_lock */
+        spin_lock(&root->fs_info->hash_lock);
+        already_cowed =
+                btrfs_header_generation(buf) == trans->transid &&
+                btrfs_header_owner(buf) == root->root_key.objectid &&
+                !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN);
+        if (already_cowed)
+                *cow_ret = buf;
+        spin_unlock(&root->fs_info->hash_lock);
+
+        if (already_cowed) {
+                WARN_ON(prealloc_dest);
+                return 0;
+        }
+
+        hint = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
+        return __btrfs_cow_block(trans, root, buf, parent, parent_slot,
+                                 cow_ret, hint, 0, prealloc_dest);
+}
+/*
+ * helper for defrag: decides whether two blocks pointed to by a node sit
+ * close together.  The gap is measured from the end of the lower block
+ * (its start plus blocksize) to the start of the higher one, and the
+ * blocks count as close when that gap is under 32768 bytes.  Two equal
+ * block numbers are never reported as close.
+ */
+static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
+{
+        u64 gap;
+
+        if (blocknr < other)
+                gap = other - (blocknr + blocksize);
+        else if (blocknr > other)
+                gap = blocknr - (other + blocksize);
+        else
+                return 0;
+
+        return gap < 32768;
+}
+/*
+ * three-way compare of an on-disk key against a cpu key, memcmp style.
+ * The disk key is converted to cpu form first; objectid is the most
+ * significant field, then type, then offset.  Returns 1, -1 or 0.
+ */
+static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
+{
+        struct btrfs_key cpu;
+
+        btrfs_disk_key_to_cpu(&cpu, disk);
+
+        if (cpu.objectid != k2->objectid)
+                return cpu.objectid > k2->objectid ? 1 : -1;
+        if (cpu.type != k2->type)
+                return cpu.type > k2->type ? 1 : -1;
+        if (cpu.offset != k2->offset)
+                return cpu.offset > k2->offset ? 1 : -1;
+        return 0;
+}
+/*
+ * like comp_keys, but both sides are already btrfs_key's in cpu form.
+ * Fields are compared in the order objectid, type, offset and the result
+ * is 1, -1 or 0.
+ */
+static int comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
+{
+        if (k1->objectid != k2->objectid)
+                return k1->objectid > k2->objectid ? 1 : -1;
+        if (k1->type != k2->type)
+                return k1->type > k2->type ? 1 : -1;
+        if (k1->offset != k2->offset)
+                return k1->offset > k2->offset ? 1 : -1;
+        return 0;
+}
+/*
+ * this is used by the defrag code to go through all the
+ * leaves pointed to by a node and reallocate them so that
+ * disk order is close to key order
+ *
+ * start_slot -- first slot of parent to look at
+ * cache_only -- when set, only parents at level 1 are processed and
+ * children that are not already cached and up to date are skipped
+ * last_ret -- in: initial allocation hint; out: start of the most
+ * recently reallocated block
+ * progress -- slots whose key compares below this key are skipped until
+ * the first slot that does not
+ *
+ * Returns 0, or the error from __btrfs_cow_block, which also stops the
+ * walk.
+ *
+ * NOTE(review): the results of read_tree_block and btrfs_read_buffer are
+ * not checked before the block is locked -- confirm they cannot fail here.
+ */
+int btrfs_realloc_node(struct btrfs_trans_handle *trans,
+                       struct btrfs_root *root, struct extent_buffer *parent,
+                       int start_slot, int cache_only, u64 *last_ret,
+                       struct btrfs_key *progress)
+{
+        struct extent_buffer *cur;
+        u64 blocknr;
+        u64 gen;
+        u64 search_start = *last_ret;
+        u64 last_block = 0;
+        u64 other;
+        u32 parent_nritems;
+        int end_slot;
+        int i;
+        int err = 0;
+        int parent_level;
+        int uptodate;
+        u32 blocksize;
+        int progress_passed = 0;
+        struct btrfs_disk_key disk_key;
+        parent_level = btrfs_header_level(parent);
+        if (cache_only && parent_level != 1)
+                return 0;
+        if (trans->transaction != root->fs_info->running_transaction)
+                WARN_ON(1);
+        if (trans->transid != root->fs_info->generation)
+                WARN_ON(1);
+        parent_nritems = btrfs_header_nritems(parent);
+        blocksize = btrfs_level_size(root, parent_level - 1); /* size of the children */
+        end_slot = parent_nritems;
+        if (parent_nritems == 1) /* a single child: nothing to do */
+                return 0;
+        for (i = start_slot; i < end_slot; i++) {
+                int close = 1;
+                if (!parent->map_token) { /* (re)map parent around slot i for the reads below */
+                        map_extent_buffer(parent,
+                                        btrfs_node_key_ptr_offset(i),
+                                        sizeof(struct btrfs_key_ptr),
+                                        &parent->map_token, &parent->kaddr,
+                                        &parent->map_start, &parent->map_len,
+                                        KM_USER1);
+                }
+                btrfs_node_key(parent, &disk_key, i);
+                if (!progress_passed && comp_keys(&disk_key, progress) < 0)
+                        continue;
+                progress_passed = 1;
+                blocknr = btrfs_node_blockptr(parent, i);
+                gen = btrfs_node_ptr_generation(parent, i);
+                if (last_block == 0)
+                        last_block = blocknr;
+                if (i > 0) { /* compare against the previous slot's block */
+                        other = btrfs_node_blockptr(parent, i - 1);
+                        close = close_blocks(blocknr, other, blocksize);
+                }
+                if (!close && i < end_slot - 2) { /* otherwise try the next slot's block */
+                        other = btrfs_node_blockptr(parent, i + 1);
+                        close = close_blocks(blocknr, other, blocksize);
+                }
+                if (close) { /* already near a neighbour: leave it where it is */
+                        last_block = blocknr;
+                        continue;
+                }
+                if (parent->map_token) { /* drop the mapping before the work below */
+                        unmap_extent_buffer(parent, parent->map_token,
+                                            KM_USER1);
+                        parent->map_token = NULL;
+                }
+                cur = btrfs_find_tree_block(root, blocknr, blocksize);
+                if (cur)
+                        uptodate = btrfs_buffer_uptodate(cur, gen);
+                else
+                        uptodate = 0;
+                if (!cur || !uptodate) {
+                        if (cache_only) { /* cache_only never reads: skip this child */
+                                free_extent_buffer(cur);
+                                continue;
+                        }
+                        if (!cur) {
+                                cur = read_tree_block(root, blocknr,
+                                                         blocksize, gen);
+                        } else if (!uptodate) {
+                                btrfs_read_buffer(cur, gen);
+                        }
+                }
+                if (search_start == 0)
+                        search_start = last_block;
+                btrfs_tree_lock(cur);
+                err = __btrfs_cow_block(trans, root, cur, parent, i,
+                                        &cur, search_start,
+                                        min(16 * blocksize,
+                                            (end_slot - i) * blocksize), 0);
+                if (err) {
+                        btrfs_tree_unlock(cur);
+                        free_extent_buffer(cur);
+                        break;
+                }
+                search_start = cur->start; /* the next allocation hint follows the new copy */
+                last_block = cur->start;
+                *last_ret = search_start;
+                btrfs_tree_unlock(cur);
+                free_extent_buffer(cur);
+        }
+        if (parent->map_token) { /* leave parent unmapped on exit */
+                unmap_extent_buffer(parent, parent->map_token,
+                                    KM_USER1);
+                parent->map_token = NULL;
+        }
+        return err;
+}
+/*
+ * item data is stacked from the end of the leaf towards the front, so the
+ * stack stops at the offset of the last item's data.  That offset is what
+ * this returns; for a leaf without items it returns
+ * BTRFS_LEAF_DATA_SIZE(root) instead.
+ */
+static inline unsigned int leaf_data_end(struct btrfs_root *root,
+                                         struct extent_buffer *leaf)
+{
+        u32 count = btrfs_header_nritems(leaf);
+
+        if (count != 0)
+                return btrfs_item_offset_nr(leaf, count - 1);
+        return BTRFS_LEAF_DATA_SIZE(root);
+}
+/*
+ * extra debugging checks to make sure the keys around the path's slot in
+ * a node are well formed and in the proper order
+ *
+ * Checked with BUG_ON: the node is not empty and does not hold more than
+ * BTRFS_NODEPTRS_PER_BLOCK(root) pointers; when the path has a parent at
+ * level + 1, the parent's key and block pointer for this node match the
+ * node's first key and header bytenr; the key at the slot sorts strictly
+ * between its neighbours.  Always returns 0.
+ */
+static int check_node(struct btrfs_root *root, struct btrfs_path *path,
+                      int level)
+{
+        struct extent_buffer *parent = NULL;
+        struct extent_buffer *node = path->nodes[level];
+        struct btrfs_disk_key parent_key;
+        struct btrfs_disk_key node_key;
+        int parent_slot;
+        int slot;
+        struct btrfs_key cpukey;
+        u32 nritems = btrfs_header_nritems(node);
+        if (path->nodes[level + 1])
+                parent = path->nodes[level + 1];
+        slot = path->slots[level];
+        BUG_ON(nritems == 0);
+        if (parent) {
+                parent_slot = path->slots[level + 1];
+                btrfs_node_key(parent, &parent_key, parent_slot);
+                btrfs_node_key(node, &node_key, 0);
+                BUG_ON(memcmp(&parent_key, &node_key,
+                              sizeof(struct btrfs_disk_key)));
+                BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
+                       btrfs_header_bytenr(node));
+        }
+        BUG_ON(nritems > BTRFS_NODEPTRS_PER_BLOCK(root));
+        if (slot != 0) { /* key at slot must sort after the key before it */
+                btrfs_node_key_to_cpu(node, &cpukey, slot - 1);
+                btrfs_node_key(node, &node_key, slot);
+                BUG_ON(comp_keys(&node_key, &cpukey) <= 0);
+        }
+        if (slot < nritems - 1) { /* and before the key after it */
+                btrfs_node_key_to_cpu(node, &cpukey, slot + 1);
+                btrfs_node_key(node, &node_key, slot);
+                BUG_ON(comp_keys(&node_key, &cpukey) >= 0);
+        }
+        return 0;
+}
+/*
+ * extra checking to make sure all the items in a leaf are
+ * well formed and in the proper order
+ *
+ * An empty leaf passes immediately.  Otherwise, with BUG_ON: when the
+ * path has a parent at level + 1, its key and block pointer for this leaf
+ * match the leaf's first key and header bytenr; the key at the slot is
+ * ordered against its neighbours and the item data of adjacent slots is
+ * contiguous; and the first item's data ends at
+ * BTRFS_LEAF_DATA_SIZE(root).  Always returns 0.
+ *
+ * NOTE(review): the slot is taken from path->slots[0] rather than
+ * path->slots[level], and the previous-item checks only run when the slot
+ * is also below nritems - 1 -- confirm both are intended.
+ */
+static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
+                      int level)
+{
+        struct extent_buffer *leaf = path->nodes[level];
+        struct extent_buffer *parent = NULL;
+        int parent_slot;
+        struct btrfs_key cpukey;
+        struct btrfs_disk_key parent_key;
+        struct btrfs_disk_key leaf_key;
+        int slot = path->slots[0];
+        u32 nritems = btrfs_header_nritems(leaf);
+        if (path->nodes[level + 1])
+                parent = path->nodes[level + 1];
+        if (nritems == 0)
+                return 0;
+        if (parent) {
+                parent_slot = path->slots[level + 1];
+                btrfs_node_key(parent, &parent_key, parent_slot);
+                btrfs_item_key(leaf, &leaf_key, 0);
+                BUG_ON(memcmp(&parent_key, &leaf_key,
+                       sizeof(struct btrfs_disk_key)));
+                BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
+                       btrfs_header_bytenr(leaf));
+        }
+        if (slot != 0 && slot < nritems - 1) { /* compare against the previous item */
+                btrfs_item_key(leaf, &leaf_key, slot);
+                btrfs_item_key_to_cpu(leaf, &cpukey, slot - 1);
+                if (comp_keys(&leaf_key, &cpukey) <= 0) {
+                        btrfs_print_leaf(root, leaf);
+                        printk(KERN_CRIT "slot %d offset bad key\n", slot);
+                        BUG_ON(1);
+                }
+                if (btrfs_item_offset_nr(leaf, slot - 1) !=
+                       btrfs_item_end_nr(leaf, slot)) { /* data must be contiguous */
+                        btrfs_print_leaf(root, leaf);
+                        printk(KERN_CRIT "slot %d offset bad\n", slot);
+                        BUG_ON(1);
+                }
+        }
+        if (slot < nritems - 1) { /* compare against the next item */
+                btrfs_item_key(leaf, &leaf_key, slot);
+                btrfs_item_key_to_cpu(leaf, &cpukey, slot + 1);
+                BUG_ON(comp_keys(&leaf_key, &cpukey) >= 0);
+                if (btrfs_item_offset_nr(leaf, slot) !=
+                        btrfs_item_end_nr(leaf, slot + 1)) { /* data must be contiguous */
+                        btrfs_print_leaf(root, leaf);
+                        printk(KERN_CRIT "slot %d offset bad\n", slot);
+                        BUG_ON(1);
+                }
+        }
+        BUG_ON(btrfs_item_offset_nr(leaf, 0) +
+               btrfs_item_size_nr(leaf, 0) != BTRFS_LEAF_DATA_SIZE(root));
+        return 0;
+}
+static noinline int check_block(struct btrfs_root *root,
+                                struct btrfs_path *path, int level)
+{
+        return 0; /* checks are switched off: nothing below this line runs */
+        if (level == 0) /* unreachable: level 0 would go to check_leaf */
+                return check_leaf(root, path, level);
+        return check_node(root, path, level); /* unreachable: other levels to check_node */
+}
+/*
+ * search for key in the extent_buffer.  The items start at offset p,
+ * and they are item_size apart.  There are 'max' items in p.
+ *
+ * the slot in the array is returned via slot, and it points to
+ * the place where you would insert key if it is not found in
+ * the array.
+ *
+ * slot may point to max if the key is bigger than all of the keys
+ *
+ * Returns 0 when an exact match is found and 1 otherwise.
+ *
+ * Each probed key is read through a mapping of the buffer when it lies
+ * inside the currently mapped window; otherwise the window is remapped,
+ * and if that fails the key is copied out into a local instead.
+ */
+static noinline int generic_bin_search(struct extent_buffer *eb,
+                                       unsigned long p,
+                                       int item_size, struct btrfs_key *key,
+                                       int max, int *slot)
+{
+        int low = 0;
+        int high = max;
+        int mid;
+        int ret;
+        struct btrfs_disk_key *tmp = NULL;
+        struct btrfs_disk_key unaligned;
+        unsigned long offset;
+        char *map_token = NULL;
+        char *kaddr = NULL;
+        unsigned long map_start = 0;
+        unsigned long map_len = 0;
+        int err;
+        while (low < high) {
+                mid = (low + high) / 2;
+                offset = p + mid * item_size; /* offset of the probed key within eb */
+                if (!map_token || offset < map_start ||
+                    (offset + sizeof(struct btrfs_disk_key)) >
+                    map_start + map_len) { /* key not fully inside the current window */
+                        if (map_token) {
+                                unmap_extent_buffer(eb, map_token, KM_USER0);
+                                map_token = NULL;
+                        }
+                        err = map_private_extent_buffer(eb, offset,
+                                                sizeof(struct btrfs_disk_key),
+                                                &map_token, &kaddr,
+                                                &map_start, &map_len, KM_USER0);
+                        if (!err) {
+                                tmp = (struct btrfs_disk_key *)(kaddr + offset -
+                                                        map_start);
+                        } else { /* mapping failed: fall back to copying the key out */
+                                read_extent_buffer(eb, &unaligned,
+                                                   offset, sizeof(unaligned));
+                                tmp = &unaligned;
+                        }
+                } else { /* reuse the existing window */
+                        tmp = (struct btrfs_disk_key *)(kaddr + offset -
+                                                        map_start);
+                }
+                ret = comp_keys(tmp, key);
+                if (ret < 0)
+                        low = mid + 1;
+                else if (ret > 0)
+                        high = mid;
+                else { /* exact match */
+                        *slot = mid;
+                        if (map_token)
+                                unmap_extent_buffer(eb, map_token, KM_USER0);
+                        return 0;
+                }
+        }
+        *slot = low; /* not found: low is the insertion point */
+        if (map_token)
+                unmap_extent_buffer(eb, map_token, KM_USER0);
+        return 1;
+}
+/*
+ * thin front end for generic_bin_search: picks the array offset and the
+ * element size that match the block type (items for a leaf at level 0,
+ * key pointers for a node at any other level) and searches all of the
+ * block's nritems entries.  The return value and *slot are those of
+ * generic_bin_search.
+ */
+static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
+                      int level, int *slot)
+{
+        unsigned long first;
+        int stride;
+
+        if (level == 0) {
+                first = offsetof(struct btrfs_leaf, items);
+                stride = sizeof(struct btrfs_item);
+        } else {
+                first = offsetof(struct btrfs_node, ptrs);
+                stride = sizeof(struct btrfs_key_ptr);
+        }
+
+        return generic_bin_search(eb, first, stride, key,
+                                  btrfs_header_nritems(eb), slot);
+}
+/*
+ * read the block that a given slot of a node points to.  The block number
+ * and expected generation come from the parent's slot, and the size is
+ * that of a block one level below the parent.
+ *
+ * NULL is returned when the slot is negative or not below the parent's
+ * nritems; otherwise the result of read_tree_block is returned as is.
+ * Calling this on a level 0 parent with a valid slot trips a BUG_ON.
+ */
+static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
+                                   struct extent_buffer *parent, int slot)
+{
+        int level = btrfs_header_level(parent);
+        u64 child_bytenr;
+        u64 child_gen;
+        u32 child_size;
+
+        if (slot < 0 || slot >= btrfs_header_nritems(parent))
+                return NULL;
+        BUG_ON(level == 0);
+
+        child_bytenr = btrfs_node_blockptr(parent, slot);
+        child_size = btrfs_level_size(root, level - 1);
+        child_gen = btrfs_node_ptr_generation(parent, slot);
+
+        return read_tree_block(root, child_bytenr, child_size, child_gen);
+}
+/*
+ * node level balancing, used to make sure nodes are in proper order for
+ * item deletion.  We balance from the top down, so we have to make sure
+ * that a deletion won't leave a node completely empty later on.
+ *
+ * trans: running transaction, used for every cow/free done here
+ * root:  tree being modified
+ * path:  path whose node at 'level' is to be balanced
+ * level: level in the path to balance; level 0 is a no-op
+ *
+ * Returns 0 when nothing had to be done or the balance went fine,
+ * otherwise the error reported by the helper that failed.
+ *
+ * On return path->nodes[level] / path->slots[level] may point into the
+ * left sibling instead of the original node.  When the node at 'level' was
+ * a root with a single pointer, its child is promoted to be the new root
+ * and path->nodes[level] is set to NULL.
+ */
+static noinline int balance_level(struct btrfs_trans_handle *trans,
+                         struct btrfs_root *root,
+                         struct btrfs_path *path, int level)
+{
+        struct extent_buffer *right = NULL;
+        struct extent_buffer *mid;
+        struct extent_buffer *left = NULL;
+        struct extent_buffer *parent = NULL;
+        int ret = 0;
+        int wret;
+        int pslot = 0;
+        int orig_slot = path->slots[level];
+        u64 orig_ptr;
+        if (level == 0)
+                return 0;
+        mid = path->nodes[level];
+        WARN_ON(!path->locks[level]);
+        WARN_ON(btrfs_header_generation(mid) != trans->transid);
+        orig_ptr = btrfs_node_blockptr(mid, orig_slot);
+        /*
+         * only index level + 1 when that level can exist.  slots[] is
+         * presumably sized like nodes[] (BTRFS_MAX_LEVEL entries), so it
+         * gets the same guard; pslot is only used when parent is set.
+         */
+        if (level < BTRFS_MAX_LEVEL - 1) {
+                parent = path->nodes[level + 1];
+                pslot = path->slots[level + 1];
+        }
+        /*
+         * deal with the case where there is only one pointer in the root
+         * by promoting the node below to a root
+         */
+        if (!parent) {
+                struct extent_buffer *child;
+                if (btrfs_header_nritems(mid) != 1)
+                        return 0;
+                /* promote the child to a root */
+                child = read_node_slot(root, mid, 0);
+                /* read_node_slot can return NULL, check before locking */
+                BUG_ON(!child);
+                btrfs_tree_lock(child);
+                ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0);
+                BUG_ON(ret);
+                spin_lock(&root->node_lock);
+                root->node = child;
+                spin_unlock(&root->node_lock);
+                ret = btrfs_update_extent_ref(trans, root, child->start,
+                                              mid->start, child->start,
+                                              root->root_key.objectid,
+                                              trans->transid, level - 1);
+                BUG_ON(ret);
+                add_root_to_dirty_list(root);
+                btrfs_tree_unlock(child);
+                path->locks[level] = 0;
+                path->nodes[level] = NULL;
+                clean_tree_block(trans, root, mid);
+                btrfs_tree_unlock(mid);
+                /* once for the path */
+                free_extent_buffer(mid);
+                /* mid is still used below; the root ptr reference keeps it */
+                ret = btrfs_free_extent(trans, root, mid->start, mid->len,
+                                        mid->start, root->root_key.objectid,
+                                        btrfs_header_generation(mid),
+                                        level, 1);
+                /* once for the root ptr */
+                free_extent_buffer(mid);
+                return ret;
+        }
+        /* nodes that are more than a quarter full are left alone */
+        if (btrfs_header_nritems(mid) >
+            BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
+                return 0;
+        /* read, lock and cow both siblings (when they exist) */
+        left = read_node_slot(root, parent, pslot - 1);
+        if (left) {
+                btrfs_tree_lock(left);
+                wret = btrfs_cow_block(trans, root, left,
+                                       parent, pslot - 1, &left, 0);
+                if (wret) {
+                        ret = wret;
+                        goto enospc;
+                }
+        }
+        right = read_node_slot(root, parent, pslot + 1);
+        if (right) {
+                btrfs_tree_lock(right);
+                wret = btrfs_cow_block(trans, root, right,
+                                       parent, pslot + 1, &right, 0);
+                if (wret) {
+                        ret = wret;
+                        goto enospc;
+                }
+        }
+        /* first, try to make some room in the middle buffer */
+        if (left) {
+                /* track the original slot as an index into left + mid */
+                orig_slot += btrfs_header_nritems(left);
+                wret = push_node_left(trans, root, left, mid, 1);
+                if (wret < 0)
+                        ret = wret;
+        }
+        /*
+         * then try to empty the right most buffer into the middle
+         */
+        if (right) {
+                wret = push_node_left(trans, root, mid, right, 1);
+                if (wret < 0 && wret != -ENOSPC)
+                        ret = wret;
+                if (btrfs_header_nritems(right) == 0) {
+                        /* right is empty now, drop it from the tree */
+                        u64 bytenr = right->start;
+                        u64 generation = btrfs_header_generation(parent);
+                        u32 blocksize = right->len;
+                        clean_tree_block(trans, root, right);
+                        btrfs_tree_unlock(right);
+                        free_extent_buffer(right);
+                        right = NULL;
+                        wret = del_ptr(trans, root, path, level + 1, pslot +
+                                       1);
+                        if (wret)
+                                ret = wret;
+                        wret = btrfs_free_extent(trans, root, bytenr,
+                                                 blocksize, parent->start,
+                                                 btrfs_header_owner(parent),
+                                                 generation, level, 1);
+                        if (wret)
+                                ret = wret;
+                } else {
+                        /* right lost its first keys, fix the parent key */
+                        struct btrfs_disk_key right_key;
+                        btrfs_node_key(right, &right_key, 0);
+                        btrfs_set_node_key(parent, &right_key, pslot + 1);
+                        btrfs_mark_buffer_dirty(parent);
+                }
+        }
+        if (btrfs_header_nritems(mid) == 1) {
+                /*
+                 * we're not allowed to leave a node with one item in the
+                 * tree during a delete.  A deletion from lower in the tree
+                 * could try to delete the only pointer in this node.
+                 * So, pull some keys from the left.
+                 * There has to be a left pointer at this point because
+                 * otherwise we would have pulled some pointers from the
+                 * right
+                 */
+                BUG_ON(!left);
+                wret = balance_node_right(trans, root, mid, left);
+                if (wret < 0) {
+                        ret = wret;
+                        goto enospc;
+                }
+                if (wret == 1) {
+                        wret = push_node_left(trans, root, left, mid, 1);
+                        if (wret < 0)
+                                ret = wret;
+                }
+                BUG_ON(wret == 1);
+        }
+        if (btrfs_header_nritems(mid) == 0) {
+                /* we've managed to empty the middle node, drop it */
+                u64 root_gen = btrfs_header_generation(parent);
+                u64 bytenr = mid->start;
+                u32 blocksize = mid->len;
+                clean_tree_block(trans, root, mid);
+                btrfs_tree_unlock(mid);
+                free_extent_buffer(mid);
+                mid = NULL;
+                wret = del_ptr(trans, root, path, level + 1, pslot);
+                if (wret)
+                        ret = wret;
+                wret = btrfs_free_extent(trans, root, bytenr, blocksize,
+                                         parent->start,
+                                         btrfs_header_owner(parent),
+                                         root_gen, level, 1);
+                if (wret)
+                        ret = wret;
+        } else {
+                /* update the parent key to reflect our changes */
+                struct btrfs_disk_key mid_key;
+                btrfs_node_key(mid, &mid_key, 0);
+                btrfs_set_node_key(parent, &mid_key, pslot);
+                btrfs_mark_buffer_dirty(parent);
+        }
+        /* update the path */
+        if (left) {
+                if (btrfs_header_nritems(left) > orig_slot) {
+                        /* the original pointer now lives in left */
+                        extent_buffer_get(left);
+                        /* left was locked after cow */
+                        path->nodes[level] = left;
+                        path->slots[level + 1] -= 1;
+                        path->slots[level] = orig_slot;
+                        if (mid) {
+                                btrfs_tree_unlock(mid);
+                                free_extent_buffer(mid);
+                        }
+                } else {
+                        /* still in mid, undo the offset added above */
+                        orig_slot -= btrfs_header_nritems(left);
+                        path->slots[level] = orig_slot;
+                }
+        }
+        /* double check we haven't messed things up */
+        check_block(root, path, level);
+        if (orig_ptr !=
+            btrfs_node_blockptr(path->nodes[level], path->slots[level]))
+                BUG();
+enospc:
+        /* shared exit: drop the sibling locks and references taken above */
+        if (right) {
+                btrfs_tree_unlock(right);
+                free_extent_buffer(right);
+        }
+        if (left) {
+                /* left stays locked when the path now points at it */
+                if (path->nodes[level] != left)
+                        btrfs_tree_unlock(left);
+                free_extent_buffer(left);
+        }
+        return ret;
+}
+/* Node balancing for insertion.  Here we only split or push nodes around
+ * when they are completely full.  This is also done top down, so we
+ * have to be pessimistic.
+ *
+ * Tries to move pointers out of path->nodes[level] into its left sibling
+ * first and, failing that, into its right sibling.
+ *
+ * Returns 0 when pointers were moved; path->nodes[level],
+ * path->slots[level] and path->slots[level + 1] then follow the original
+ * pointer into whichever block holds it now.  Returns 1 when nothing
+ * could be moved, which includes level 0, a node without a parent in the
+ * path, full siblings, and failures of the cow or push helpers.
+ */
+static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
+                                          struct btrfs_root *root,
+                                          struct btrfs_path *path, int level)
+{
+        struct extent_buffer *right = NULL;
+        struct extent_buffer *mid;
+        struct extent_buffer *left = NULL;
+        struct extent_buffer *parent = NULL;
+        int ret = 0;
+        int wret;
+        int pslot = 0;
+        int orig_slot = path->slots[level];
+        u64 orig_ptr;
+        if (level == 0)
+                return 1;
+        mid = path->nodes[level];
+        WARN_ON(btrfs_header_generation(mid) != trans->transid);
+        orig_ptr = btrfs_node_blockptr(mid, orig_slot);
+        /*
+         * only index level + 1 when that level can exist.  slots[] is
+         * presumably sized like nodes[] (BTRFS_MAX_LEVEL entries), so it
+         * gets the same guard; pslot is only used when parent is set.
+         */
+        if (level < BTRFS_MAX_LEVEL - 1) {
+                parent = path->nodes[level + 1];
+                pslot = path->slots[level + 1];
+        }
+        if (!parent)
+                return 1;
+        left = read_node_slot(root, parent, pslot - 1);
+        /* first, try to make some room in the middle buffer */
+        if (left) {
+                u32 left_nr;
+                btrfs_tree_lock(left);
+                left_nr = btrfs_header_nritems(left);
+                if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
+                        /* left is (nearly) full, nothing to gain */
+                        wret = 1;
+                } else {
+                        ret = btrfs_cow_block(trans, root, left, parent,
+                                              pslot - 1, &left, 0);
+                        if (ret)
+                                wret = 1;
+                        else {
+                                wret = push_node_left(trans, root,
+                                                      left, mid, 0);
+                        }
+                }
+                if (wret < 0)
+                        ret = wret;
+                if (wret == 0) {
+                        struct btrfs_disk_key disk_key;
+                        /* index of the original pointer in left + mid */
+                        orig_slot += left_nr;
+                        /* mid's first key changed, fix it in the parent */
+                        btrfs_node_key(mid, &disk_key, 0);
+                        btrfs_set_node_key(parent, &disk_key, pslot);
+                        btrfs_mark_buffer_dirty(parent);
+                        if (btrfs_header_nritems(left) > orig_slot) {
+                                /* pointer moved to left: path takes left */
+                                path->nodes[level] = left;
+                                path->slots[level + 1] -= 1;
+                                path->slots[level] = orig_slot;
+                                btrfs_tree_unlock(mid);
+                                free_extent_buffer(mid);
+                        } else {
+                                /* pointer stayed in mid, at a lower slot */
+                                orig_slot -=
+                                        btrfs_header_nritems(left);
+                                path->slots[level] = orig_slot;
+                                btrfs_tree_unlock(left);
+                                free_extent_buffer(left);
+                        }
+                        return 0;
+                }
+                btrfs_tree_unlock(left);
+                free_extent_buffer(left);
+        }
+        right = read_node_slot(root, parent, pslot + 1);
+        /*
+         * then try to push pointers from the middle into the right buffer
+         */
+        if (right) {
+                u32 right_nr;
+                btrfs_tree_lock(right);
+                right_nr = btrfs_header_nritems(right);
+                if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
+                        /* right is (nearly) full, nothing to gain */
+                        wret = 1;
+                } else {
+                        ret = btrfs_cow_block(trans, root, right,
+                                              parent, pslot + 1,
+                                              &right, 0);
+                        if (ret)
+                                wret = 1;
+                        else {
+                                wret = balance_node_right(trans, root,
+                                                          right, mid);
+                        }
+                }
+                if (wret < 0)
+                        ret = wret;
+                if (wret == 0) {
+                        struct btrfs_disk_key disk_key;
+                        /* right's first key changed, fix it in the parent */
+                        btrfs_node_key(right, &disk_key, 0);
+                        btrfs_set_node_key(parent, &disk_key, pslot + 1);
+                        btrfs_mark_buffer_dirty(parent);
+                        if (btrfs_header_nritems(mid) <= orig_slot) {
+                                /* pointer moved to right: path takes right */
+                                path->nodes[level] = right;
+                                path->slots[level + 1] += 1;
+                                path->slots[level] = orig_slot -
+                                        btrfs_header_nritems(mid);
+                                btrfs_tree_unlock(mid);
+                                free_extent_buffer(mid);
+                        } else {
+                                btrfs_tree_unlock(right);
+                                free_extent_buffer(right);
+                        }
+                        return 0;
+                }
+                btrfs_tree_unlock(right);
+                free_extent_buffer(right);
+        }
+        return 1;
+}
+/*
+ * Readahead for the leaves referenced by one level 1 node.
+ *
+ * Starting from 'slot' in path->nodes[level], walk the node's pointers in
+ * the direction given by path->reada (negative: towards slot 0, positive:
+ * towards the last slot) and trigger readahead on every block whose
+ * number lies within 16384 of the range of block numbers seen so far.
+ *
+ * Nothing is done unless level is 1, the path has a node at that level
+ * and the block at 'slot' is not already found by btrfs_find_tree_block().
+ * When walking backwards with a non-zero objectid, the walk stops at the
+ * first key that belongs to a different objectid.
+ */
+static noinline void reada_for_search(struct btrfs_root *root,
+                                      struct btrfs_path *path,
+                                      int level, int slot, u64 objectid)
+{
+        struct extent_buffer *node;
+        struct extent_buffer *cached;
+        struct btrfs_disk_key disk_key;
+        u64 target;
+        u64 window_lo;
+        u64 window_hi;
+        u64 bytes_queued = 0;
+        u64 max_bytes;
+        u32 max_scan;
+        u32 nritems;
+        u32 cur;
+        u32 blocksize;
+        u32 scanned = 0;
+        int direction = path->reada;
+        int nearby;
+        if (level != 1)
+                return;
+        node = path->nodes[level];
+        if (!node)
+                return;
+        target = btrfs_node_blockptr(node, slot);
+        blocksize = btrfs_level_size(root, level - 1);
+        /* the block we are heading for is already around: no readahead */
+        cached = btrfs_find_tree_block(root, target, blocksize);
+        if (cached) {
+                free_extent_buffer(cached);
+                return;
+        }
+        window_lo = target;
+        window_hi = target;
+        nritems = btrfs_header_nritems(node);
+        cur = slot;
+        /* reada values below 2 get the smaller budget */
+        if (direction < 2) {
+                max_bytes = 64 * 1024;
+                max_scan = 32;
+        } else {
+                max_bytes = 256 * 1024;
+                max_scan = 128;
+        }
+        for (;;) {
+                /* step to the next slot, stopping at either end */
+                if (direction < 0) {
+                        if (cur == 0)
+                                break;
+                        cur--;
+                } else if (direction > 0) {
+                        cur++;
+                        if (cur >= nritems)
+                                break;
+                }
+                if (direction < 0 && objectid) {
+                        btrfs_node_key(node, &disk_key, cur);
+                        if (btrfs_disk_key_objectid(&disk_key) != objectid)
+                                break;
+                }
+                target = btrfs_node_blockptr(node, cur);
+                /* window_lo <= window_hi always holds, so three cases */
+                if (target < window_lo)
+                        nearby = (window_lo - target <= 16384);
+                else if (target > window_hi)
+                        nearby = (target - window_hi <= 16384);
+                else
+                        nearby = 1;
+                if (nearby) {
+                        readahead_tree_block(root, target, blocksize,
+                                     btrfs_node_ptr_generation(node, cur));
+                        bytes_queued += blocksize;
+                }
+                scanned++;
+                if (bytes_queued > max_bytes || scanned > max_scan)
+                        break;
+                /* grow the window to cover the block just looked at */
+                if (target < window_lo)
+                        window_lo = target;
+                if (target > window_hi)
+                        window_hi = target;
+        }
+}
+/*
+ * Drop locks on the upper levels of a path once they are no longer needed.
+ *
+ * Walking up from 'level', a block normally gets unlocked.  Two things
+ * keep a block (and everything below it) locked instead:
+ *
+ *  - the path goes through slot 0 of the block, because changes further
+ *    down might require updating key pointers higher up in the tree
+ *  - path->keep_locks is set and the path points at the last slot of the
+ *    block (or the block is empty); callers use this while walking the
+ *    tree to pick the next slot in the higher block
+ *
+ * Once one block has qualified for unlocking, those two checks are no
+ * longer applied to the levels above it.
+ *
+ * lowest_unlock is the lowest level this function may unlock, so with
+ * lowest_unlock == 1 level 0 is never unlocked.  The walk ends at the
+ * first level that has no node or is not locked.
+ */
+static noinline void unlock_up(struct btrfs_path *path, int level,
+                               int lowest_unlock)
+{
+        int i;
+        int skip_level = level;
+        int no_skips = 0;
+        for (i = level;
+             i < BTRFS_MAX_LEVEL && path->nodes[i] && path->locks[i];
+             i++) {
+                if (!no_skips) {
+                        if (path->slots[i] == 0) {
+                                skip_level = i + 1;
+                                continue;
+                        }
+                        if (path->keep_locks) {
+                                u32 nritems;
+                                nritems = btrfs_header_nritems(path->nodes[i]);
+                                if (nritems < 1 ||
+                                    path->slots[i] >= nritems - 1) {
+                                        skip_level = i + 1;
+                                        continue;
+                                }
+                        }
+                }
+                /* locks[i] is known to be set by the loop condition */
+                if (i >= lowest_unlock && i > skip_level) {
+                        no_skips = 1;
+                        btrfs_tree_unlock(path->nodes[i]);
+                        path->locks[i] = 0;
+                }
+        }
+}
+/*
+ * look for key in the tree.  path is filled in with nodes along the way
+ * if key is found, we return zero and you can find the item in the leaf
+ * level of the path (level 0)
+ *
+ * If the key isn't found, the path points to the slot where it should
+ * be inserted, and 1 is returned.  If there are other errors during the
+ * search a negative error number is returned.
+ *
+ * if ins_len > 0, nodes and leaves will be split as we walk down the
+ * tree.  if ins_len < 0, nodes will be merged as we walk down the tree (if
+ * possible)
+ *
+ * cow must be non-zero whenever ins_len is non-zero (enforced with
+ * BUG_ON); with cow set, every block on the way down that is not already
+ * owned by this root in the running transaction is cowed.
+ *
+ * The path is expected to be empty on entry (p->nodes[0] == NULL).  When
+ * p->lowest_level is set, the search stops at that level and returns 0.
+ * The walk may drop the whole path and restart from the root several
+ * times, see the 'again' label.
+ */
+int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
+                      *root, struct btrfs_key *key, struct btrfs_path *p, int
+                      ins_len, int cow)
+{
+        struct extent_buffer *b;
+        struct extent_buffer *tmp;
+        int slot;
+        int ret;
+        int level;
+        int should_reada = p->reada;
+        int lowest_unlock = 1;
+        int blocksize;
+        u8 lowest_level = 0;
+        u64 blocknr;
+        u64 gen;
+        struct btrfs_key prealloc_block;
+        lowest_level = p->lowest_level;
+        WARN_ON(lowest_level && ins_len > 0);
+        WARN_ON(p->nodes[0] != NULL);
+        /* deletions keep level 1 locked as well as the leaf */
+        if (ins_len < 0)
+                lowest_unlock = 2;
+        /* objectid == 0 means no extent is reserved for the next cow */
+        prealloc_block.objectid = 0;
+again:
+        if (p->skip_locking)
+                b = btrfs_root_node(root);
+        else
+                b = btrfs_lock_root_node(root);
+        while (b) {
+                level = btrfs_header_level(b);
+                /*
+                 * setup the path here so we can release it under lock
+                 * contention with the cow code
+                 */
+                p->nodes[level] = b;
+                if (!p->skip_locking)
+                        p->locks[level] = 1;
+                if (cow) {
+                        int wret;
+                        /* is a cow on this block not required */
+                        spin_lock(&root->fs_info->hash_lock);
+                        if (btrfs_header_generation(b) == trans->transid &&
+                            btrfs_header_owner(b) == root->root_key.objectid &&
+                            !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
+                                spin_unlock(&root->fs_info->hash_lock);
+                                goto cow_done;
+                        }
+                        spin_unlock(&root->fs_info->hash_lock);
+                        /* ok, we have to cow, is our old prealloc the right
+                         * size?
+                         */
+                        if (prealloc_block.objectid &&
+                            prealloc_block.offset != b->len) {
+                                btrfs_free_reserved_extent(root,
+                                           prealloc_block.objectid,
+                                           prealloc_block.offset);
+                                prealloc_block.objectid = 0;
+                        }
+                        /*
+                         * for higher level blocks, try not to allocate blocks
+                         * with the block and the parent locks held.
+                         */
+                        if (level > 1 && !prealloc_block.objectid &&
+                            btrfs_path_lock_waiting(p, level)) {
+                                u32 size = b->len;
+                                u64 hint = b->start;
+                                /* drop all locks, reserve, then start over */
+                                btrfs_release_path(root, p);
+                                ret = btrfs_reserve_extent(trans, root,
+                                                           size, size, 0,
+                                                           hint, (u64)-1,
+                                                           &prealloc_block, 0);
+                                BUG_ON(ret);
+                                goto again;
+                        }
+                        wret = btrfs_cow_block(trans, root, b,
+                                               p->nodes[level + 1],
+                                               p->slots[level + 1],
+                                               &b, prealloc_block.objectid);
+                        /* the reservation was handed to btrfs_cow_block */
+                        prealloc_block.objectid = 0;
+                        if (wret) {
+                                /*
+                                 * NOTE(review): p->nodes[level] may still
+                                 * reference this buffer when the path is
+                                 * released later; confirm this free does not
+                                 * drop a reference the path still owns.
+                                 */
+                                free_extent_buffer(b);
+                                ret = wret;
+                                goto done;
+                        }
+                }
+cow_done:
+                BUG_ON(!cow && ins_len);
+                if (level != btrfs_header_level(b))
+                        WARN_ON(1);
+                /* the cow may have replaced b, record the new block */
+                level = btrfs_header_level(b);
+                p->nodes[level] = b;
+                if (!p->skip_locking)
+                        p->locks[level] = 1;
+                ret = check_block(root, p, level);
+                if (ret) {
+                        ret = -1;
+                        goto done;
+                }
+                ret = bin_search(b, key, level, &slot);
+                if (level != 0) {
+                        /*
+                         * no exact match in a node: follow the pointer
+                         * just before the position bin_search reported
+                         */
+                        if (ret && slot > 0)
+                                slot -= 1;
+                        p->slots[level] = slot;
+                        if ((p->search_for_split || ins_len > 0) &&
+                            btrfs_header_nritems(b) >=
+                            BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
+                                int sret = split_node(trans, root, p, level);
+                                BUG_ON(sret > 0);
+                                if (sret) {
+                                        ret = sret;
+                                        goto done;
+                                }
+                                b = p->nodes[level];
+                                slot = p->slots[level];
+                        } else if (ins_len < 0) {
+                                int sret = balance_level(trans, root, p,
+                                                         level);
+                                if (sret) {
+                                        ret = sret;
+                                        goto done;
+                                }
+                                b = p->nodes[level];
+                                /* the node at this level is gone, restart */
+                                if (!b) {
+                                        btrfs_release_path(NULL, p);
+                                        goto again;
+                                }
+                                slot = p->slots[level];
+                                BUG_ON(btrfs_header_nritems(b) == 1);
+                        }
+                        unlock_up(p, level, lowest_unlock);
+                        /* this is only true while dropping a snapshot */
+                        if (level == lowest_level) {
+                                ret = 0;
+                                goto done;
+                        }
+                        blocknr = btrfs_node_blockptr(b, slot);
+                        gen = btrfs_node_ptr_generation(b, slot);
+                        blocksize = btrfs_level_size(root, level - 1);
+                        tmp = btrfs_find_tree_block(root, blocknr, blocksize);
+                        if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
+                                b = tmp;
+                        } else {
+                                /*
+                                 * reduce lock contention at high levels
+                                 * of the btree by dropping locks before
+                                 * we read.
+                                 */
+                                if (level > 1) {
+                                        btrfs_release_path(NULL, p);
+                                        if (tmp)
+                                                free_extent_buffer(tmp);
+                                        if (should_reada)
+                                                reada_for_search(root, p,
+                                                                 level, slot,
+                                                                 key->objectid);
+                                        /*
+                                         * read the block without holding any
+                                         * locks, then redo the search
+                                         */
+                                        tmp = read_tree_block(root, blocknr,
+                                                         blocksize, gen);
+                                        if (tmp)
+                                                free_extent_buffer(tmp);
+                                        goto again;
+                                } else {
+                                        if (tmp)
+                                                free_extent_buffer(tmp);
+                                        if (should_reada)
+                                                reada_for_search(root, p,
+                                                                 level, slot,
+                                                                 key->objectid);
+                                        b = read_node_slot(root, b, slot);
+                                        /*
+                                         * read_node_slot returns NULL on
+                                         * error; report it instead of locking
+                                         * a NULL buffer or returning 1
+                                         */
+                                        if (!b) {
+                                                ret = -EIO;
+                                                goto done;
+                                        }
+                                }
+                        }
+                        if (!p->skip_locking)
+                                btrfs_tree_lock(b);
+                } else {
+                        p->slots[level] = slot;
+                        if (ins_len > 0 &&
+                            btrfs_leaf_free_space(root, b) < ins_len) {
+                                int sret = split_leaf(trans, root, key,
+                                                      p, ins_len, ret == 0);
+                                BUG_ON(sret > 0);
+                                if (sret) {
+                                        ret = sret;
+                                        goto done;
+                                }
+                        }
+                        if (!p->search_for_split)
+                                unlock_up(p, level, lowest_unlock);
+                        /* ret still holds the bin_search result (0 or 1) */
+                        goto done;
+                }
+        }
+        ret = 1;
+done:
+        /* give back a reservation that was never used for a cow */
+        if (prealloc_block.objectid) {
+                btrfs_free_reserved_extent(root,
+                           prealloc_block.objectid,
+                           prealloc_block.offset);
+        }
+        return ret;
+}
+int btrfs_merge_path(struct btrfs_trans_handle *trans,
+                     struct btrfs_root *root,
+                     struct btrfs_key *node_keys,
+                     u64 *nodes, int lowest_level)
+{
+        struct extent_buffer *eb;
+        struct extent_buffer *parent;
+        struct btrfs_key key;
+        u64 bytenr;
+        u64 generation;
+        u32 blocksize;
+        int level;
+        int slot;
+        int key_match;
+        int ret;
+        eb = btrfs_lock_root_node(root);
+        ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0);
+        BUG_ON(ret);
+        parent = eb;
+        while (1) {
+                level = btrfs_header_level(parent);
+                if (level == 0 || level <= lowest_level)
+                        break;
+                ret = bin_search(parent, &node_keys[lowest_level], level,
+                                 &slot);
+                if (ret && slot > 0)
+                        slot--;
+                bytenr = btrfs_node_blockptr(parent, slot);
+                if (nodes[level - 1] == bytenr)
+                        break;
+                blocksize = btrfs_level_size(root, level - 1);
+                generation = btrfs_node_ptr_generation(parent, slot);
+                btrfs_node_key_to_cpu(eb, &key, slot);
+                key_match = !memcmp(&key, &node_keys[level - 1], sizeof(key));
+                if (generation == trans->transid) {
+                        eb = read_tree_block(root, bytenr, blocksize,
+                                             generation);
+                        btrfs_tree_lock(eb);
+                }
+                /*
+                 * if node keys match and node pointer hasn't been modified
+                 * in the running transaction, we can merge the path. for
+                 * blocks owned by reloc trees, the node pointer check is
+                 * skipped, this is because these blocks are fully controlled
+                 * by the space balance code, no one else can modify them.
+                 */
+                if (!nodes[level - 1] || !key_match ||
+                    (generation == trans->transid &&
+                     btrfs_header_owner(eb) != BTRFS_TREE_RELOC_OBJECTID)) {
+                        if (level == 1 || level == lowest_level + 1) {
+                                if (generation == trans->transid) {
+                                        btrfs_tree_unlock(eb);
+                                        free_extent_buffer(eb);
+                                }
+                                break;
+                        }
+                        if (generation != trans->transid) {
+                                eb = read_tree_block(root, bytenr, blocksize,
+                                                generation);
+                                btrfs_tree_lock(eb);
+                        }
+                        ret = btrfs_cow_block(trans, root, eb, parent, slot,
+                                              &eb, 0);
+                        BUG_ON(ret);
+                        if (root->root_key.objectid ==
+                            BTRFS_TREE_RELOC_OBJECTID) {
+                                if (!nodes[level - 1]) {
+                                        nodes[level - 1] = eb->start;
+                                        memcpy(&node_keys[level - 1], &key,
+                                               sizeof(node_keys[0]));
+                                } else {
+                                        WARN_ON(1);
+                                }
+                        }
+                        btrfs_tree_unlock(parent);
+                        free_extent_buffer(parent);
+                        parent = eb;
+                        continue;
+                }
+                btrfs_set_node_blockptr(parent, slot, nodes[level - 1]);
+                btrfs_set_node_ptr_generation(parent, slot, trans->transid);
+                btrfs_mark_buffer_dirty(parent);
+                ret = btrfs_inc_extent_ref(trans, root,
+                                        nodes[level - 1],
+                                        blocksize, parent->start,
+                                        btrfs_header_owner(parent),
+                                        btrfs_header_generation(parent),
+                                        level - 1);
+                BUG_ON(ret);
+                /*
+                 * If the block was created in the running transaction,
+                 * it's possible this is the last reference to it, so we
+                 * should drop the subtree.
+                 */
+                if (generation == trans->transid) {
+                        ret = btrfs_drop_subtree(trans, root, eb, parent);
+                        BUG_ON(ret);
+                        btrfs_tree_unlock(eb);
+                        free_extent_buffer(eb);
+                } else {
+                        ret = btrfs_free_extent(trans, root, bytenr,
+                                        blocksize, parent->start,
+                                        btrfs_header_owner(parent),
+                                        btrfs_header_generation(parent),
+                                        level - 1, 1);
+                        BUG_ON(ret);
+                }
+                break;
+        }
+        btrfs_tree_unlock(parent);
+        free_extent_buffer(parent);
+        return 0;
+}
+/*
+ * adjust the keys going up the tree, starting at 'level', so that the
+ * key stored at path->slots[i] of each node in the path is set to 'key'.
+ * This is used after shifting pointers to the left, so it stops
+ * fixing up keys as soon as a given leaf/node is not in slot 0 of the
+ * higher level, or when the path has no node at that level.
+ *
+ * The nodes are only changed in memory and marked dirty; 'ret' is never
+ * modified, so this always returns 0.
+ */
+static int fixup_low_keys(struct btrfs_trans_handle *trans,
+                          struct btrfs_root *root, struct btrfs_path *path,
+                          struct btrfs_disk_key *key, int level)
+{
+        int i;
+        int ret = 0;
+        struct extent_buffer *t;
+        for (i = level; i < BTRFS_MAX_LEVEL; i++) {
+                int tslot = path->slots[i];
+                if (!path->nodes[i]) /* no node at this level of the path */
+                        break;
+                t = path->nodes[i];
+                btrfs_set_node_key(t, key, tslot);
+                btrfs_mark_buffer_dirty(path->nodes[i]);
+                if (tslot != 0) /* slot 0 of this node was not touched: stop */
+                        break;
+        }
+        return ret;
+}
+/*
+ * update the key of the item at path->slots[0] of the leaf path->nodes[0].
+ *
+ * Returns -1 if comp_keys() rejects new_key against the neighbouring items,
+ * else 0.  Ordering beyond this leaf is the caller's responsibility.
+ */
+int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
+                            struct btrfs_root *root, struct btrfs_path *path,
+                            struct btrfs_key *new_key)
+{
+        struct btrfs_disk_key disk_key;
+        struct extent_buffer *eb;
+        int slot;
+        eb = path->nodes[0];
+        slot = path->slots[0];
+        if (slot > 0) { /* compare with the previous item */
+                btrfs_item_key(eb, &disk_key, slot - 1);
+                if (comp_keys(&disk_key, new_key) >= 0)
+                        return -1;
+        }
+        if (slot < btrfs_header_nritems(eb) - 1) { /* compare with the next item */
+                btrfs_item_key(eb, &disk_key, slot + 1);
+                if (comp_keys(&disk_key, new_key) <= 0)
+                        return -1;
+        }
+        btrfs_cpu_key_to_disk(&disk_key, new_key);
+        btrfs_set_item_key(eb, &disk_key, slot);
+        btrfs_mark_buffer_dirty(eb);
+        if (slot == 0) /* key in slot 0 changed: fix keys from level 1 up */
+                fixup_low_keys(trans, root, path, &disk_key, 1);
+        return 0;
+}
+/*
+ * try to push key pointers from the front of 'src' onto the end of 'dst'.
+ * src keeps at least 8 ptrs, or is emptied if 'empty' is set and all fit.
+ *
+ * returns 0 if some ptrs were pushed left and 1 if nothing was pushed;
+ * a btrfs_update_ref() failure hits BUG_ON() instead of returning < 0.
+ */
+static int push_node_left(struct btrfs_trans_handle *trans,
+                          struct btrfs_root *root, struct extent_buffer *dst,
+                          struct extent_buffer *src, int empty)
+{
+        int push_items = 0;
+        int src_nritems;
+        int dst_nritems;
+        int ret = 0;
+        src_nritems = btrfs_header_nritems(src);
+        dst_nritems = btrfs_header_nritems(dst);
+        push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems; /* free slots in dst */
+        WARN_ON(btrfs_header_generation(src) != trans->transid);
+        WARN_ON(btrfs_header_generation(dst) != trans->transid);
+        if (!empty && src_nritems <= 8)
+                return 1;
+        if (push_items <= 0)
+                return 1;
+        if (empty) {
+                push_items = min(src_nritems, push_items);
+                if (push_items < src_nritems) {
+                        /* leave at least 8 pointers in the node if
+                         * we aren't going to empty it
+                         */
+                        if (src_nritems - push_items < 8) {
+                                if (push_items <= 8)
+                                        return 1;
+                                push_items -= 8;
+                        }
+                }
+        } else
+                push_items = min(src_nritems - 8, push_items);
+        copy_extent_buffer(dst, src,
+                           btrfs_node_key_ptr_offset(dst_nritems),
+                           btrfs_node_key_ptr_offset(0),
+                           push_items * sizeof(struct btrfs_key_ptr));
+        if (push_items < src_nritems) { /* slide the remaining ptrs to slot 0 */
+                memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
+                                      btrfs_node_key_ptr_offset(push_items),
+                                      (src_nritems - push_items) *
+                                      sizeof(struct btrfs_key_ptr));
+        }
+        btrfs_set_header_nritems(src, src_nritems - push_items);
+        btrfs_set_header_nritems(dst, dst_nritems + push_items);
+        btrfs_mark_buffer_dirty(src);
+        btrfs_mark_buffer_dirty(dst);
+        ret = btrfs_update_ref(trans, root, src, dst, dst_nritems, push_items);
+        BUG_ON(ret);
+        return ret;
+}
+/*
+ * try to push key pointers from the end of 'src' onto the front of 'dst',
+ * the next node right in the tree.
+ *
+ * returns 0 if some ptrs were pushed and 1 if nothing was pushed; a
+ * btrfs_update_ref() failure hits BUG_ON() instead of returning < 0.
+ *
+ * this will only push up to src_nritems / 2 + 1 ptrs of the left node over
+ */
+static int balance_node_right(struct btrfs_trans_handle *trans,
+                              struct btrfs_root *root,
+                              struct extent_buffer *dst,
+                              struct extent_buffer *src)
+{
+        int push_items = 0;
+        int max_push;
+        int src_nritems;
+        int dst_nritems;
+        int ret = 0;
+        WARN_ON(btrfs_header_generation(src) != trans->transid);
+        WARN_ON(btrfs_header_generation(dst) != trans->transid);
+        src_nritems = btrfs_header_nritems(src);
+        dst_nritems = btrfs_header_nritems(dst);
+        push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems; /* free slots in dst */
+        if (push_items <= 0)
+                return 1;
+        if (src_nritems < 4) /* nodes with fewer than 4 ptrs are left alone */
+                return 1;
+        max_push = src_nritems / 2 + 1;
+        /* don't try to empty the node */
+        if (max_push >= src_nritems)
+                return 1;
+        if (max_push < push_items)
+                push_items = max_push;
+        memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
+                                      btrfs_node_key_ptr_offset(0),
+                                      (dst_nritems) *
+                                      sizeof(struct btrfs_key_ptr));
+        copy_extent_buffer(dst, src,
+                           btrfs_node_key_ptr_offset(0),
+                           btrfs_node_key_ptr_offset(src_nritems - push_items),
+                           push_items * sizeof(struct btrfs_key_ptr));
+        btrfs_set_header_nritems(src, src_nritems - push_items);
+        btrfs_set_header_nritems(dst, dst_nritems + push_items);
+        btrfs_mark_buffer_dirty(src);
+        btrfs_mark_buffer_dirty(dst);
+        ret = btrfs_update_ref(trans, root, src, dst, 0, push_items);
+        BUG_ON(ret);
+        return ret;
+}
+/*
+ * helper function to insert a new root level in the tree.
+ * A new node is allocated at 'level', a single key pointer to the existing
+ * root is stored in its slot 0, and it replaces root->node and tops the path.
+ *
+ * returns zero on success or the PTR_ERR() of a failed block allocation.
+ */
+static noinline int insert_new_root(struct btrfs_trans_handle *trans,
+                           struct btrfs_root *root,
+                           struct btrfs_path *path, int level)
+{
+        u64 lower_gen;
+        struct extent_buffer *lower;
+        struct extent_buffer *c;
+        struct extent_buffer *old;
+        struct btrfs_disk_key lower_key;
+        int ret;
+        BUG_ON(path->nodes[level]);
+        BUG_ON(path->nodes[level-1] != root->node);
+        lower = path->nodes[level-1];
+        if (level == 1) /* the old root is a leaf: read an item key */
+                btrfs_item_key(lower, &lower_key, 0);
+        else
+                btrfs_node_key(lower, &lower_key, 0);
+        c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
+                                   root->root_key.objectid, trans->transid,
+                                   level, root->node->start, 0);
+        if (IS_ERR(c))
+                return PTR_ERR(c);
+        memset_extent_buffer(c, 0, 0, root->nodesize);
+        btrfs_set_header_nritems(c, 1);
+        btrfs_set_header_level(c, level);
+        btrfs_set_header_bytenr(c, c->start);
+        btrfs_set_header_generation(c, trans->transid);
+        btrfs_set_header_owner(c, root->root_key.objectid);
+        write_extent_buffer(c, root->fs_info->fsid,
+                            (unsigned long)btrfs_header_fsid(c),
+                            BTRFS_FSID_SIZE);
+        write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
+                            (unsigned long)btrfs_header_chunk_tree_uuid(c),
+                            BTRFS_UUID_SIZE);
+        btrfs_set_node_key(c, &lower_key, 0);
+        btrfs_set_node_blockptr(c, 0, lower->start);
+        lower_gen = btrfs_header_generation(lower);
+        WARN_ON(lower_gen != trans->transid);
+        btrfs_set_node_ptr_generation(c, 0, lower_gen);
+        btrfs_mark_buffer_dirty(c);
+        spin_lock(&root->node_lock); /* root->node is swapped under node_lock */
+        old = root->node;
+        root->node = c;
+        spin_unlock(&root->node_lock);
+        ret = btrfs_update_extent_ref(trans, root, lower->start,
+                                      lower->start, c->start,
+                                      root->root_key.objectid,
+                                      trans->transid, level - 1);
+        BUG_ON(ret);
+        /* the super has an extra ref to root->node */
+        free_extent_buffer(old);
+        add_root_to_dirty_list(root);
+        extent_buffer_get(c); /* presumably the ref held by path->nodes[level] */
+        path->nodes[level] = c;
+        path->locks[level] = 1; /* NOTE(review): assumes c is locked here - confirm */
+        path->slots[level] = 0;
+        return 0;
+}
+/*
+ * worker function to insert a single pointer in a node.
+ * the node should have enough room for the pointer already
+ *
+ * slot and level indicate where you want the key to go, and
+ * bytenr is the block the key points to.
+ *
+ * always returns zero; a bad slot or an already full node hits BUG()
+ */
+static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
+                      *root, struct btrfs_path *path, struct btrfs_disk_key
+                      *key, u64 bytenr, int slot, int level)
+{
+        struct extent_buffer *lower;
+        int nritems;
+        BUG_ON(!path->nodes[level]);
+        lower = path->nodes[level];
+        nritems = btrfs_header_nritems(lower);
+        if (slot > nritems)
+                BUG();
+        if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root))
+                BUG();
+        if (slot != nritems) { /* not appending: shift later ptrs up by one */
+                memmove_extent_buffer(lower,
+                              btrfs_node_key_ptr_offset(slot + 1),
+                              btrfs_node_key_ptr_offset(slot),
+                              (nritems - slot) * sizeof(struct btrfs_key_ptr));
+        }
+        btrfs_set_node_key(lower, key, slot);
+        btrfs_set_node_blockptr(lower, slot, bytenr);
+        WARN_ON(trans->transid == 0);
+        btrfs_set_node_ptr_generation(lower, slot, trans->transid);
+        btrfs_set_header_nritems(lower, nritems + 1);
+        btrfs_mark_buffer_dirty(lower);
+        return 0;
+}
+/*
+ * split the node at the specified level in path in two.
+ * The path is corrected to point to the appropriate node after the split.
+ *
+ * Unless the node is the root (which gets a new root above it instead), this
+ * first tries push_nodes_for_insert() and returns if that made enough room.
+ *
+ * returns 0 on success and < 0 on failure
+ */
+static noinline int split_node(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *root,
+                               struct btrfs_path *path, int level)
+{
+        struct extent_buffer *c;
+        struct extent_buffer *split;
+        struct btrfs_disk_key disk_key;
+        int mid;
+        int ret;
+        int wret;
+        u32 c_nritems;
+        c = path->nodes[level];
+        WARN_ON(btrfs_header_generation(c) != trans->transid);
+        if (c == root->node) {
+                /* trying to split the root, lets make a new one */
+                ret = insert_new_root(trans, root, path, level + 1);
+                if (ret)
+                        return ret;
+        } else {
+                ret = push_nodes_for_insert(trans, root, path, level);
+                c = path->nodes[level]; /* re-read: the path may now hold another node */
+                if (!ret && btrfs_header_nritems(c) <
+                    BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
+                        return 0;
+                if (ret < 0)
+                        return ret;
+        }
+        c_nritems = btrfs_header_nritems(c);
+        split = btrfs_alloc_free_block(trans, root, root->nodesize,
+                                        path->nodes[level + 1]->start,
+                                        root->root_key.objectid,
+                                        trans->transid, level, c->start, 0);
+        if (IS_ERR(split))
+                return PTR_ERR(split);
+        btrfs_set_header_flags(split, btrfs_header_flags(c));
+        btrfs_set_header_level(split, btrfs_header_level(c));
+        btrfs_set_header_bytenr(split, split->start);
+        btrfs_set_header_generation(split, trans->transid);
+        btrfs_set_header_owner(split, root->root_key.objectid);
+        btrfs_set_header_flags(split, 0); /* replaces the flags copied from c above */
+        write_extent_buffer(split, root->fs_info->fsid,
+                            (unsigned long)btrfs_header_fsid(split),
+                            BTRFS_FSID_SIZE);
+        write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
+                            (unsigned long)btrfs_header_chunk_tree_uuid(split),
+                            BTRFS_UUID_SIZE);
+        mid = (c_nritems + 1) / 2; /* c keeps slots [0, mid), split gets the rest */
+        copy_extent_buffer(split, c,
+                           btrfs_node_key_ptr_offset(0),
+                           btrfs_node_key_ptr_offset(mid),
+                           (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
+        btrfs_set_header_nritems(split, c_nritems - mid);
+        btrfs_set_header_nritems(c, mid);
+        ret = 0;
+        btrfs_mark_buffer_dirty(c);
+        btrfs_mark_buffer_dirty(split);
+        btrfs_node_key(split, &disk_key, 0);
+        wret = insert_ptr(trans, root, path, &disk_key, split->start,
+                          path->slots[level + 1] + 1,
+                          level + 1);
+        if (wret)
+                ret = wret;
+        ret = btrfs_update_ref(trans, root, c, split, 0, c_nritems - mid); /* NOTE(review): overwrites wret */
+        BUG_ON(ret);
+        if (path->slots[level] >= mid) { /* our slot moved into split */
+                path->slots[level] -= mid;
+                btrfs_tree_unlock(c);
+                free_extent_buffer(c);
+                path->nodes[level] = split;
+                path->slots[level + 1] += 1;
+        } else {
+                btrfs_tree_unlock(split);
+                free_extent_buffer(split);
+        }
+        return ret;
+}
+/*
+ * how many bytes are required to store nr items of leaf l, beginning at
+ * item start.  This totals up the space used both by the item structs and
+ * the item data.  Returns 0 when nr is 0.
+ */
+static int leaf_space_used(struct extent_buffer *l, int start, int nr)
+{
+        int data_len;
+        int nritems = btrfs_header_nritems(l);
+        int end = min(nritems, start + nr) - 1; /* last item, clamped to nritems */
+        if (!nr)
+                return 0;
+        data_len = btrfs_item_end_nr(l, start);
+        data_len = data_len - btrfs_item_offset_nr(l, end);
+        data_len += sizeof(struct btrfs_item) * nr;
+        WARN_ON(data_len < 0);
+        return data_len;
+}
+/*
+ * The space between the end of the leaf items and the start of the leaf
+ * data, IOW how much room the leaf has left for both items and data.
+ * A negative result is reported via printk(KERN_CRIT) and still returned.
+ */
+noinline int btrfs_leaf_free_space(struct btrfs_root *root,
+                                   struct extent_buffer *leaf)
+{
+        int nritems = btrfs_header_nritems(leaf);
+        int ret;
+        ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
+        if (ret < 0) { /* more space used than the leaf can hold */
+                printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, "
+                       "used %d nritems %d\n",
+                       ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
+                       leaf_space_used(leaf, 0, nritems), nritems);
+        }
+        return ret;
+}
+/*
+ * push items from the end of the path leaf into the leaf to its right,
+ * trying to free up at least data_size bytes.
+ *
+ * returns 0 if items were pushed and 1 if nothing was pushed (no right
+ * sibling, not enough room, or btrfs_cow_block() failed on the sibling).
+ */
+static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
+                           *root, struct btrfs_path *path, int data_size,
+                           int empty)
+{
+        struct extent_buffer *left = path->nodes[0];
+        struct extent_buffer *right;
+        struct extent_buffer *upper;
+        struct btrfs_disk_key disk_key;
+        int slot;
+        u32 i;
+        int free_space;
+        int push_space = 0;
+        int push_items = 0;
+        struct btrfs_item *item;
+        u32 left_nritems;
+        u32 nr;
+        u32 right_nritems;
+        u32 data_end;
+        u32 this_item_size;
+        int ret;
+        slot = path->slots[1];
+        if (!path->nodes[1])
+                return 1;
+        upper = path->nodes[1];
+        if (slot >= btrfs_header_nritems(upper) - 1) /* last slot: no right sibling in upper */
+                return 1;
+        WARN_ON(!btrfs_tree_locked(path->nodes[1]));
+        right = read_node_slot(root, upper, slot + 1); /* TODO confirm: never NULL? */
+        btrfs_tree_lock(right);
+        free_space = btrfs_leaf_free_space(root, right);
+        if (free_space < data_size)
+                goto out_unlock;
+        /* cow and double check */
+        ret = btrfs_cow_block(trans, root, right, upper,
+                              slot + 1, &right, 0);
+        if (ret)
+                goto out_unlock;
+        free_space = btrfs_leaf_free_space(root, right);
+        if (free_space < data_size)
+                goto out_unlock;
+        left_nritems = btrfs_header_nritems(left);
+        if (left_nritems == 0)
+                goto out_unlock;
+        if (empty)
+                nr = 0;
+        else
+                nr = 1;
+        if (path->slots[0] >= left_nritems)
+                push_space += data_size;
+        i = left_nritems - 1;
+        while (i >= nr) { /* walk the left leaf from its last item down to nr */
+                item = btrfs_item_nr(left, i);
+                if (!empty && push_items > 0) {
+                        if (path->slots[0] > i)
+                                break;
+                        if (path->slots[0] == i) {
+                                int space = btrfs_leaf_free_space(root, left);
+                                if (space + push_space * 2 > free_space)
+                                        break;
+                        }
+                }
+                if (path->slots[0] == i)
+                        push_space += data_size;
+                if (!left->map_token) {
+                        map_extent_buffer(left, (unsigned long)item,
+                                        sizeof(struct btrfs_item),
+                                        &left->map_token, &left->kaddr,
+                                        &left->map_start, &left->map_len,
+                                        KM_USER1);
+                }
+                this_item_size = btrfs_item_size(left, item);
+                if (this_item_size + sizeof(*item) + push_space > free_space)
+                        break;
+                push_items++;
+                push_space += this_item_size + sizeof(*item);
+                if (i == 0) /* i is a u32: stop here rather than wrap below zero */
+                        break;
+                i--;
+        }
+        if (left->map_token) {
+                unmap_extent_buffer(left, left->map_token, KM_USER1);
+                left->map_token = NULL;
+        }
+        if (push_items == 0)
+                goto out_unlock;
+        if (!empty && push_items == left_nritems)
+                WARN_ON(1);
+        /* push left to right */
+        right_nritems = btrfs_header_nritems(right);
+        push_space = btrfs_item_end_nr(left, left_nritems - push_items);
+        push_space -= leaf_data_end(root, left);
+        /* make room in the right data area */
+        data_end = leaf_data_end(root, right);
+        memmove_extent_buffer(right,
+                              btrfs_leaf_data(right) + data_end - push_space,
+                              btrfs_leaf_data(right) + data_end,
+                              BTRFS_LEAF_DATA_SIZE(root) - data_end);
+        /* copy from the left data area */
+        copy_extent_buffer(right, left, btrfs_leaf_data(right) +
+                     BTRFS_LEAF_DATA_SIZE(root) - push_space,
+                     btrfs_leaf_data(left) + leaf_data_end(root, left),
+                     push_space);
+        memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
+                              btrfs_item_nr_offset(0),
+                              right_nritems * sizeof(struct btrfs_item));
+        /* copy the items from left to right */
+        copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
+                   btrfs_item_nr_offset(left_nritems - push_items),
+                   push_items * sizeof(struct btrfs_item));
+        /* update the item pointers */
+        right_nritems += push_items;
+        btrfs_set_header_nritems(right, right_nritems);
+        push_space = BTRFS_LEAF_DATA_SIZE(root);
+        for (i = 0; i < right_nritems; i++) { /* recompute every item offset in right */
+                item = btrfs_item_nr(right, i);
+                if (!right->map_token) {
+                        map_extent_buffer(right, (unsigned long)item,
+                                        sizeof(struct btrfs_item),
+                                        &right->map_token, &right->kaddr,
+                                        &right->map_start, &right->map_len,
+                                        KM_USER1);
+                }
+                push_space -= btrfs_item_size(right, item);
+                btrfs_set_item_offset(right, item, push_space);
+        }
+        if (right->map_token) {
+                unmap_extent_buffer(right, right->map_token, KM_USER1);
+                right->map_token = NULL;
+        }
+        left_nritems -= push_items;
+        btrfs_set_header_nritems(left, left_nritems);
+        if (left_nritems)
+                btrfs_mark_buffer_dirty(left);
+        btrfs_mark_buffer_dirty(right);
+        ret = btrfs_update_ref(trans, root, left, right, 0, push_items);
+        BUG_ON(ret);
+        btrfs_item_key(right, &disk_key, 0);
+        btrfs_set_node_key(upper, &disk_key, slot + 1); /* right has a new first key */
+        btrfs_mark_buffer_dirty(upper);
+        /* then fixup the leaf pointer in the path */
+        if (path->slots[0] >= left_nritems) {
+                path->slots[0] -= left_nritems;
+                if (btrfs_header_nritems(path->nodes[0]) == 0)
+                        clean_tree_block(trans, root, path->nodes[0]);
+                btrfs_tree_unlock(path->nodes[0]);
+                free_extent_buffer(path->nodes[0]);
+                path->nodes[0] = right;
+                path->slots[1] += 1;
+        } else {
+                btrfs_tree_unlock(right);
+                free_extent_buffer(right);
+        }
+        return 0;
+out_unlock:
+        btrfs_tree_unlock(right);
+        free_extent_buffer(right);
+        return 1;
+}
+/*
+ * push some data in the path leaf to the left, trying to free up at
+ * least data_size bytes.  returns zero if the push worked, nonzero otherwise
+ */
+static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
+                          *root, struct btrfs_path *path, int data_size,
+                          int empty)
+{
+        struct btrfs_disk_key disk_key;
+        struct extent_buffer *right = path->nodes[0];
+        struct extent_buffer *left;
+        int slot;
+        int i;
+        int free_space;
+        int push_space = 0;
+        int push_items = 0;
+        struct btrfs_item *item;
+        u32 old_left_nritems;
+        u32 right_nritems;
+        u32 nr;
+        int ret = 0;
+        int wret;
+        u32 this_item_size;
+        u32 old_left_item_size;
+        slot = path->slots[1];
+        if (slot == 0)
+                return 1;
+        if (!path->nodes[1])
+                return 1;
+        right_nritems = btrfs_header_nritems(right);
+        if (right_nritems == 0)
+                return 1;
+        WARN_ON(!btrfs_tree_locked(path->nodes[1]));
+        left = read_node_slot(root, path->nodes[1], slot - 1);
+        btrfs_tree_lock(left);
+        free_space = btrfs_leaf_free_space(root, left);
+        if (free_space < data_size) {
+                ret = 1;
+                goto out;
+        }
+        /* cow and double check */
+        ret = btrfs_cow_block(trans, root, left,
+                              path->nodes[1], slot - 1, &left, 0);
+        if (ret) {
+                /* we hit -ENOSPC, but it isn't fatal here */
+                ret = 1;
+                goto out;
+        }
+        free_space = btrfs_leaf_free_space(root, left);
+        if (free_space < data_size) {
+                ret = 1;
+                goto out;
+        }
+        if (empty)
+                nr = right_nritems;
+        else
+                nr = right_nritems - 1;
+        for (i = 0; i < nr; i++) {
+                item = btrfs_item_nr(right, i);
+                if (!right->map_token) {
+                        map_extent_buffer(right, (unsigned long)item,
+                                        sizeof(struct btrfs_item),
+                                        &right->map_token, &right->kaddr,
+                                        &right->map_start, &right->map_len,
+                                        KM_USER1);
+                }
+                if (!empty && push_items > 0) {
+                        if (path->slots[0] < i)
+                                break;
+                        if (path->slots[0] == i) {
+                                int space = btrfs_leaf_free_space(root, right);
+                                if (space + push_space * 2 > free_space)
+                                        break;
+                        }
+                }
+                if (path->slots[0] == i)
+                        push_space += data_size;
+                this_item_size = btrfs_item_size(right, item);
+                if (this_item_size + sizeof(*item) + push_space > free_space)
+                        break;
+                push_items++;
+                push_space += this_item_size + sizeof(*item);
+        }
+        if (right->map_token) {
+                unmap_extent_buffer(right, right->map_token, KM_USER1);
+                right->map_token = NULL;
+        }
+        if (push_items == 0) {
+                ret = 1;
+                goto out;
+        }
+        if (!empty && push_items == btrfs_header_nritems(right))
+                WARN_ON(1);
+        /* push data from right to left */
+        copy_extent_buffer(left, right,
+                           btrfs_item_nr_offset(btrfs_header_nritems(left)),
+                           btrfs_item_nr_offset(0),
+                           push_items * sizeof(struct btrfs_item));
+        push_space = BTRFS_LEAF_DATA_SIZE(root) -
+                     btrfs_item_offset_nr(right, push_items - 1);
+        copy_extent_buffer(left, right, btrfs_leaf_data(left) +
+                     leaf_data_end(root, left) - push_space,
+                     btrfs_leaf_data(right) +
+                     btrfs_item_offset_nr(right, push_items - 1),
+                     push_space);
+        old_left_nritems = btrfs_header_nritems(left);
+        BUG_ON(old_left_nritems <= 0);
+        old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
+        for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
+                u32 ioff;
+                item = btrfs_item_nr(left, i);
+                if (!left->map_token) {
+                        map_extent_buffer(left, (unsigned long)item,
+                                        sizeof(struct btrfs_item),
+                                        &left->map_token, &left->kaddr,
+                                        &left->map_start, &left->map_len,
+                                        KM_USER1);
+                }
+                ioff = btrfs_item_offset(left, item);
+                btrfs_set_item_offset(left, item,
+                      ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size));
+        }
+        btrfs_set_header_nritems(left, old_left_nritems + push_items);
+        if (left->map_token) {
+                unmap_extent_buffer(left, left->map_token, KM_USER1);
+                left->map_token = NULL;
+        }
+        /* fixup right node */
+        if (push_items > right_nritems) {
+                printk(KERN_CRIT "push items %d nr %u\n", push_items,
+                       right_nritems);
+                WARN_ON(1);
+        }
+        if (push_items < right_nritems) {
+                push_space = btrfs_item_offset_nr(right, push_items - 1) -
+                                                  leaf_data_end(root, right);
+                memmove_extent_buffer(right, btrfs_leaf_data(right) +
+                                      BTRFS_LEAF_DATA_SIZE(root) - push_space,
+                                      btrfs_leaf_data(right) +
+                                      leaf_data_end(root, right), push_space);
+                memmove_extent_buffer(right, btrfs_item_nr_offset(0),
+                              btrfs_item_nr_offset(push_items),
+                             (btrfs_header_nritems(right) - push_items) *
+                             sizeof(struct btrfs_item));
+        }
+        right_nritems -= push_items;
+        btrfs_set_header_nritems(right, right_nritems);
+        push_space = BTRFS_LEAF_DATA_SIZE(root);
+        for (i = 0; i < right_nritems; i++) {
+                item = btrfs_item_nr(right, i);
+                if (!right->map_token) {
+                        map_extent_buffer(right, (unsigned long)item,
+                                        sizeof(struct btrfs_item),
+                                        &right->map_token, &right->kaddr,
+                                        &right->map_start, &right->map_len,
+                                        KM_USER1);
+                }
+                push_space = push_space - btrfs_item_size(right, item);
+                btrfs_set_item_offset(right, item, push_space);
+        }
+        if (right->map_token) {
+                unmap_extent_buffer(right, right->map_token, KM_USER1);
+                right->map_token = NULL;
+        }
+        btrfs_mark_buffer_dirty(left);
+        if (right_nritems)
+                btrfs_mark_buffer_dirty(right);
+        ret = btrfs_update_ref(trans, root, right, left,
+                               old_left_nritems, push_items);
+        BUG_ON(ret);
+        btrfs_item_key(right, &disk_key, 0);
+        wret = fixup_low_keys(trans, root, path, &disk_key, 1);
+        if (wret)
+                ret = wret;
+        /* then fixup the leaf pointer in the path */
+        if (path->slots[0] < push_items) {
+                path->slots[0] += old_left_nritems;
+                if (btrfs_header_nritems(path->nodes[0]) == 0)
+                        clean_tree_block(trans, root, path->nodes[0]);
+                btrfs_tree_unlock(path->nodes[0]);
+                free_extent_buffer(path->nodes[0]);
+                path->nodes[0] = left;
+                path->slots[1] -= 1;
+        } else {
+                btrfs_tree_unlock(left);
+                free_extent_buffer(left);
+                path->slots[0] -= push_items;
+        }
+        BUG_ON(path->slots[0] < 0);
+        return ret;
+out:
+        btrfs_tree_unlock(left);
+        free_extent_buffer(left);
+        return ret;
+}
+/*
+ * split the path's leaf in two, making sure there is at least data_size
+ * available for the resulting leaf level of the path.
+ *
+ * ins_key is the key that is about to be inserted; it is only consulted
+ * to skip the push attempts for directory items and to key a brand new
+ * empty leaf.  A new empty leaf is linked in (instead of moving items)
+ * when the target slot is past the last item, or when it is slot 0 with
+ * extend clear and data_size non-zero, and the half that would receive
+ * the insertion cannot hold data_size more bytes.
+ *
+ * On return path->nodes[0] and path->slots[0] refer to the leaf and slot
+ * where the caller should continue.
+ *
+ * returns 0 if all went well and < 0 on failure.
+ */
+static noinline int split_leaf(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *root,
+                               struct btrfs_key *ins_key,
+                               struct btrfs_path *path, int data_size,
+                               int extend)
+{
+        struct extent_buffer *l;
+        u32 nritems;
+        int mid;
+        int slot;
+        struct extent_buffer *right;
+        int data_copy_size;
+        int rt_data_off;
+        int i;
+        int ret = 0;
+        int wret;
+        int double_split;
+        int num_doubles = 0;
+        struct btrfs_disk_key disk_key;
+        /* first try to make some room by pushing left and right */
+        if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
+                wret = push_leaf_right(trans, root, path, data_size, 0);
+                if (wret < 0)
+                        return wret;
+                /* a non-zero, non-error result means: also try the left */
+                if (wret) {
+                        wret = push_leaf_left(trans, root, path, data_size, 0);
+                        if (wret < 0)
+                                return wret;
+                }
+                l = path->nodes[0];
+                /* did the pushes work? */
+                if (btrfs_leaf_free_space(root, l) >= data_size)
+                        return 0;
+        }
+        /*
+         * the leaf has no parent in the path, so one is created first;
+         * the code below needs path->nodes[1] to link in the new leaf
+         */
+        if (!path->nodes[1]) {
+                ret = insert_new_root(trans, root, path, 1);
+                if (ret)
+                        return ret;
+        }
+        /* re-entered at most once, when a single split was not enough */
+again:
+        double_split = 0;
+        l = path->nodes[0];
+        slot = path->slots[0];
+        nritems = btrfs_header_nritems(l);
+        /* default split point: the middle of the leaf, rounded up */
+        mid = (nritems + 1) / 2;
+        right = btrfs_alloc_free_block(trans, root, root->leafsize,
+                                        path->nodes[1]->start,
+                                        root->root_key.objectid,
+                                        trans->transid, 0, l->start, 0);
+        /*
+         * NOTE(review): BUG_ON(1) fires before the error is returned, so
+         * the return below presumably never runs -- confirm this is wanted
+         */
+        if (IS_ERR(right)) {
+                BUG_ON(1);
+                return PTR_ERR(right);
+        }
+        /* initialize the header of the new leaf */
+        memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
+        btrfs_set_header_bytenr(right, right->start);
+        btrfs_set_header_generation(right, trans->transid);
+        btrfs_set_header_owner(right, root->root_key.objectid);
+        btrfs_set_header_level(right, 0);
+        write_extent_buffer(right, root->fs_info->fsid,
+                            (unsigned long)btrfs_header_fsid(right),
+                            BTRFS_FSID_SIZE);
+        write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
+                            (unsigned long)btrfs_header_chunk_tree_uuid(right),
+                            BTRFS_UUID_SIZE);
+        if (mid <= slot) {
+                /* the target slot would land in the right half */
+                if (nritems == 1 ||
+                    leaf_space_used(l, mid, nritems - mid) + data_size >
+                        BTRFS_LEAF_DATA_SIZE(root)) {
+                        /*
+                         * inserting past the last item: link the new leaf
+                         * in empty, right after the current one, and point
+                         * the path at its slot 0
+                         */
+                        if (slot >= nritems) {
+                                btrfs_cpu_key_to_disk(&disk_key, ins_key);
+                                btrfs_set_header_nritems(right, 0);
+                                wret = insert_ptr(trans, root, path,
+                                                  &disk_key, right->start,
+                                                  path->slots[1] + 1, 1);
+                                if (wret)
+                                        ret = wret;
+                                btrfs_tree_unlock(path->nodes[0]);
+                                free_extent_buffer(path->nodes[0]);
+                                path->nodes[0] = right;
+                                path->slots[0] = 0;
+                                path->slots[1] += 1;
+                                btrfs_mark_buffer_dirty(right);
+                                return ret;
+                        }
+                        /*
+                         * split exactly at the target slot; if even that
+                         * leaves the right side too full, split once more
+                         */
+                        mid = slot;
+                        if (mid != nritems &&
+                            leaf_space_used(l, mid, nritems - mid) +
+                            data_size > BTRFS_LEAF_DATA_SIZE(root)) {
+                                double_split = 1;
+                        }
+                }
+        } else {
+                /* the target slot would stay in the left half */
+                if (leaf_space_used(l, 0, mid) + data_size >
+                        BTRFS_LEAF_DATA_SIZE(root)) {
+                        /*
+                         * inserting a new item at slot 0: link the new
+                         * leaf in empty, in front of the current one
+                         */
+                        if (!extend && data_size && slot == 0) {
+                                btrfs_cpu_key_to_disk(&disk_key, ins_key);
+                                btrfs_set_header_nritems(right, 0);
+                                wret = insert_ptr(trans, root, path,
+                                                  &disk_key,
+                                                  right->start,
+                                                  path->slots[1], 1);
+                                if (wret)
+                                        ret = wret;
+                                btrfs_tree_unlock(path->nodes[0]);
+                                free_extent_buffer(path->nodes[0]);
+                                path->nodes[0] = right;
+                                path->slots[0] = 0;
+                                if (path->slots[1] == 0) {
+                                        wret = fixup_low_keys(trans, root,
+                                                      path, &disk_key, 1);
+                                        if (wret)
+                                                ret = wret;
+                                }
+                                btrfs_mark_buffer_dirty(right);
+                                return ret;
+                        } else if ((extend || !data_size) && slot == 0) {
+                                /* keep only item 0 in the left leaf */
+                                mid = 1;
+                        } else {
+                                mid = slot;
+                                if (mid != nritems &&
+                                    leaf_space_used(l, mid, nritems - mid) +
+                                    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
+                                        double_split = 1;
+                                }
+                        }
+                }
+        }
+        /* from here on nritems is the number of items moving to 'right' */
+        nritems = nritems - mid;
+        btrfs_set_header_nritems(right, nritems);
+        data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
+        /* copy the item headers, then the item data, for items mid..end */
+        copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
+                           btrfs_item_nr_offset(mid),
+                           nritems * sizeof(struct btrfs_item));
+        copy_extent_buffer(right, l,
+                     btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
+                     data_copy_size, btrfs_leaf_data(l) +
+                     leaf_data_end(root, l), data_copy_size);
+        /* the copied data now ends at the end of the leaf; rebase offsets */
+        rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
+                      btrfs_item_end_nr(l, mid);
+        for (i = 0; i < nritems; i++) {
+                struct btrfs_item *item = btrfs_item_nr(right, i);
+                u32 ioff;
+                if (!right->map_token) {
+                        map_extent_buffer(right, (unsigned long)item,
+                                        sizeof(struct btrfs_item),
+                                        &right->map_token, &right->kaddr,
+                                        &right->map_start, &right->map_len,
+                                        KM_USER1);
+                }
+                ioff = btrfs_item_offset(right, item);
+                btrfs_set_item_offset(right, item, ioff + rt_data_off);
+        }
+        if (right->map_token) {
+                unmap_extent_buffer(right, right->map_token, KM_USER1);
+                right->map_token = NULL;
+        }
+        /* the left leaf keeps items 0..mid-1 */
+        btrfs_set_header_nritems(l, mid);
+        ret = 0;
+        btrfs_item_key(right, &disk_key, 0);
+        wret = insert_ptr(trans, root, path, &disk_key, right->start,
+                          path->slots[1] + 1, 1);
+        if (wret)
+                ret = wret;
+        btrfs_mark_buffer_dirty(right);
+        btrfs_mark_buffer_dirty(l);
+        BUG_ON(path->slots[0] != slot);
+        /*
+         * NOTE(review): this assignment overwrites any error stored in ret
+         * by the insert_ptr() call above -- confirm that is intended
+         */
+        ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
+        BUG_ON(ret);
+        if (mid <= slot) {
+                /* the target slot moved: make the path follow it */
+                btrfs_tree_unlock(path->nodes[0]);
+                free_extent_buffer(path->nodes[0]);
+                path->nodes[0] = right;
+                path->slots[0] -= mid;
+                path->slots[1] += 1;
+        } else {
+                /* the path stays on the left leaf; drop the new one */
+                btrfs_tree_unlock(right);
+                free_extent_buffer(right);
+        }
+        BUG_ON(path->slots[0] < 0);
+        if (double_split) {
+                BUG_ON(num_doubles != 0);
+                num_doubles++;
+                goto again;
+        }
+        return ret;
+}
+/*
+ * This function splits a single item into two items,
+ * giving 'new_key' to the new item and splitting the
+ * old one at split_offset (from the start of the item).
+ *
+ * The path may be released by this operation.  After
+ * the split, the path is pointing to the old item.  The
+ * new item is going to be in the same node as the old one.
+ *
+ * Note, the item being split must be small enough to live alone on
+ * a tree block with room for one extra struct btrfs_item
+ *
+ * This allows us to split the item in place, keeping a lock on the
+ * leaf the entire time.
+ *
+ * Returns 0 on success, -EAGAIN if the item was not found or changed
+ * size after the path was released and searched again, and -ENOMEM if
+ * the temporary copy of the item data could not be allocated.
+ */
+int btrfs_split_item(struct btrfs_trans_handle *trans,
+                     struct btrfs_root *root,
+                     struct btrfs_path *path,
+                     struct btrfs_key *new_key,
+                     unsigned long split_offset)
+{
+        u32 item_size;
+        struct extent_buffer *leaf;
+        struct btrfs_key orig_key;
+        struct btrfs_item *item;
+        struct btrfs_item *new_item;
+        int ret = 0;
+        int slot;
+        u32 nritems;
+        u32 orig_offset;
+        struct btrfs_disk_key disk_key;
+        char *buf;
+        leaf = path->nodes[0];
+        btrfs_item_key_to_cpu(leaf, &orig_key, path->slots[0]);
+        /* only one more item header is needed; the data is reused */
+        if (btrfs_leaf_free_space(root, leaf) >= sizeof(struct btrfs_item))
+                goto split;
+        /*
+         * no room for the extra header: remember the item size, drop the
+         * path and search again in split mode so the leaf can be split
+         */
+        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+        btrfs_release_path(root, path);
+        path->search_for_split = 1;
+        path->keep_locks = 1;
+        ret = btrfs_search_slot(trans, root, &orig_key, path, 0, 1);
+        path->search_for_split = 0;
+        /* if our item isn't there or got smaller, return now */
+        if (ret != 0 || item_size != btrfs_item_size_nr(path->nodes[0],
+                                                        path->slots[0])) {
+                path->keep_locks = 0;
+                return -EAGAIN;
+        }
+        ret = split_leaf(trans, root, &orig_key, path,
+                         sizeof(struct btrfs_item), 1);
+        path->keep_locks = 0;
+        BUG_ON(ret);
+        leaf = path->nodes[0];
+        BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
+split:
+        item = btrfs_item_nr(leaf, path->slots[0]);
+        orig_offset = btrfs_item_offset(leaf, item);
+        item_size = btrfs_item_size(leaf, item);
+        /*
+         * take a private copy of the item data before anything in the
+         * leaf is touched, so a failed allocation leaves it unmodified
+         */
+        buf = kmalloc(item_size, GFP_NOFS);
+        if (!buf)
+                return -ENOMEM;
+        read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
+                            path->slots[0]), item_size);
+        /* the new item goes into the slot right after the original */
+        slot = path->slots[0] + 1;
+        leaf = path->nodes[0];
+        nritems = btrfs_header_nritems(leaf);
+        if (slot != nritems) {
+                /* shift the items */
+                memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
+                              btrfs_item_nr_offset(slot),
+                              (nritems - slot) * sizeof(struct btrfs_item));
+        }
+        btrfs_cpu_key_to_disk(&disk_key, new_key);
+        btrfs_set_item_key(leaf, &disk_key, slot);
+        /*
+         * the new item takes the low part of the original data area, the
+         * original item keeps the split_offset bytes above it
+         */
+        new_item = btrfs_item_nr(leaf, slot);
+        btrfs_set_item_offset(leaf, new_item, orig_offset);
+        btrfs_set_item_size(leaf, new_item, item_size - split_offset);
+        btrfs_set_item_offset(leaf, item,
+                              orig_offset + item_size - split_offset);
+        btrfs_set_item_size(leaf, item, split_offset);
+        btrfs_set_header_nritems(leaf, nritems + 1);
+        /* write the data for the start of the original item */
+        write_extent_buffer(leaf, buf,
+                            btrfs_item_ptr_offset(leaf, path->slots[0]),
+                            split_offset);
+        /* write the data for the new item */
+        write_extent_buffer(leaf, buf + split_offset,
+                            btrfs_item_ptr_offset(leaf, slot),
+                            item_size - split_offset);
+        btrfs_mark_buffer_dirty(leaf);
+        ret = 0;
+        if (btrfs_leaf_free_space(root, leaf) < 0) {
+                btrfs_print_leaf(root, leaf);
+                BUG();
+        }
+        kfree(buf);
+        return ret;
+}
+/*
+ * make the item pointed to by the path smaller.  new_size indicates
+ * how small to make it, and from_end tells us if we just chop bytes
+ * off the end of the item or if we shift the item to chop bytes off
+ * the front.
+ *
+ * When bytes are chopped off the front, the offset of the item's key is
+ * increased by the number of bytes removed.
+ *
+ * Returns 0; an inconsistent leaf ends in BUG().
+ */
+int btrfs_truncate_item(struct btrfs_trans_handle *trans,
+                        struct btrfs_root *root,
+                        struct btrfs_path *path,
+                        u32 new_size, int from_end)
+{
+        int ret = 0;
+        int slot;
+        int slot_orig;
+        struct extent_buffer *leaf;
+        struct btrfs_item *item;
+        u32 nritems;
+        unsigned int data_end;
+        unsigned int old_data_start;
+        unsigned int old_size;
+        unsigned int size_diff;
+        int i;
+        /* NOTE(review): slot_orig is assigned but never read here */
+        slot_orig = path->slots[0];
+        leaf = path->nodes[0];
+        slot = path->slots[0];
+        old_size = btrfs_item_size_nr(leaf, slot);
+        /* nothing to do when the size does not change */
+        if (old_size == new_size)
+                return 0;
+        nritems = btrfs_header_nritems(leaf);
+        data_end = leaf_data_end(root, leaf);
+        old_data_start = btrfs_item_offset_nr(leaf, slot);
+        /*
+         * NOTE(review): unsigned arithmetic, so this wraps if new_size is
+         * larger than old_size -- callers presumably only ever shrink
+         */
+        size_diff = old_size - new_size;
+        /* NOTE(review): slot is only validated after it was used above */
+        BUG_ON(slot < 0);
+        BUG_ON(slot >= nritems);
+        /*
+         * item0..itemN ... dataN.offset..dataN.size .. data0.size
+         */
+        /* first correct the data pointers */
+        for (i = slot; i < nritems; i++) {
+                u32 ioff;
+                item = btrfs_item_nr(leaf, i);
+                if (!leaf->map_token) {
+                        map_extent_buffer(leaf, (unsigned long)item,
+                                        sizeof(struct btrfs_item),
+                                        &leaf->map_token, &leaf->kaddr,
+                                        &leaf->map_start, &leaf->map_len,
+                                        KM_USER1);
+                }
+                ioff = btrfs_item_offset(leaf, item);
+                btrfs_set_item_offset(leaf, item, ioff + size_diff);
+        }
+        if (leaf->map_token) {
+                unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
+                leaf->map_token = NULL;
+        }
+        /* shift the data */
+        if (from_end) {
+                /*
+                 * move everything from the end of the leaf data up to and
+                 * including the first new_size bytes of this item
+                 */
+                memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
+                              data_end + size_diff, btrfs_leaf_data(leaf) +
+                              data_end, old_data_start + new_size - data_end);
+        } else {
+                struct btrfs_disk_key disk_key;
+                u64 offset;
+                btrfs_item_key(leaf, &disk_key, slot);
+                if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
+                        unsigned long ptr;
+                        struct btrfs_file_extent_item *fi;
+                        /*
+                         * the item offset was already advanced by
+                         * size_diff in the loop above, so step back by
+                         * size_diff to address the old start of the item
+                         */
+                        fi = btrfs_item_ptr(leaf, slot,
+                                            struct btrfs_file_extent_item);
+                        fi = (struct btrfs_file_extent_item *)(
+                             (unsigned long)fi - size_diff);
+                        /*
+                         * for inline extents, carry the leading fields of
+                         * the file extent item (everything before
+                         * disk_bytenr) over to the new start of the item
+                         */
+                        if (btrfs_file_extent_type(leaf, fi) ==
+                            BTRFS_FILE_EXTENT_INLINE) {
+                                ptr = btrfs_item_ptr_offset(leaf, slot);
+                                memmove_extent_buffer(leaf, ptr,
+                                      (unsigned long)fi,
+                                      offsetof(struct btrfs_file_extent_item,
+                                                 disk_bytenr));
+                        }
+                }
+                /*
+                 * move the data of the items after this one up by
+                 * size_diff, over the front of this item
+                 */
+                memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
+                              data_end + size_diff, btrfs_leaf_data(leaf) +
+                              data_end, old_data_start - data_end);
+                /* the item now starts size_diff bytes later: fix its key */
+                offset = btrfs_disk_key_offset(&disk_key);
+                btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
+                btrfs_set_item_key(leaf, &disk_key, slot);
+                /*
+                 * NOTE(review): the return value of fixup_low_keys() is
+                 * dropped here, unlike in the insert paths
+                 */
+                if (slot == 0)
+                        fixup_low_keys(trans, root, path, &disk_key, 1);
+        }
+        item = btrfs_item_nr(leaf, slot);
+        btrfs_set_item_size(leaf, item, new_size);
+        btrfs_mark_buffer_dirty(leaf);
+        ret = 0;
+        if (btrfs_leaf_free_space(root, leaf) < 0) {
+                btrfs_print_leaf(root, leaf);
+                BUG();
+        }
+        return ret;
+}
+/*
+ * Grow the item the path points to by data_size bytes.
+ *
+ * The leaf must already have at least data_size bytes free; otherwise
+ * the leaf is printed and BUG() is hit.  The data of this item and of
+ * every item after it is moved down by data_size, so the added bytes
+ * end up at the tail of the item.
+ *
+ * Returns 0.
+ */
+int btrfs_extend_item(struct btrfs_trans_handle *trans,
+                      struct btrfs_root *root, struct btrfs_path *path,
+                      u32 data_size)
+{
+        struct extent_buffer *leaf = path->nodes[0];
+        u32 nritems = btrfs_header_nritems(leaf);
+        unsigned int data_end = leaf_data_end(root, leaf);
+        struct btrfs_item *item;
+        unsigned int item_end;
+        unsigned int cur_size;
+        int slot;
+        int idx;
+        if (btrfs_leaf_free_space(root, leaf) < data_size) {
+                btrfs_print_leaf(root, leaf);
+                BUG();
+        }
+        slot = path->slots[0];
+        item_end = btrfs_item_end_nr(leaf, slot);
+        BUG_ON(slot < 0);
+        if (slot >= nritems) {
+                btrfs_print_leaf(root, leaf);
+                printk(KERN_CRIT "slot %d too large, nritems %d\n",
+                       slot, nritems);
+                BUG_ON(1);
+        }
+        /*
+         * leaf layout:
+         * item0..itemN ... dataN.offset..dataN.size .. data0.size
+         *
+         * step 1: lower the data offset of this item and all later ones
+         */
+        idx = slot;
+        while (idx < nritems) {
+                u32 off;
+                item = btrfs_item_nr(leaf, idx);
+                if (!leaf->map_token) {
+                        map_extent_buffer(leaf, (unsigned long)item,
+                                          sizeof(struct btrfs_item),
+                                          &leaf->map_token, &leaf->kaddr,
+                                          &leaf->map_start, &leaf->map_len,
+                                          KM_USER1);
+                }
+                off = btrfs_item_offset(leaf, item);
+                btrfs_set_item_offset(leaf, item, off - data_size);
+                idx++;
+        }
+        if (leaf->map_token) {
+                unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
+                leaf->map_token = NULL;
+        }
+        /* step 2: slide the affected data down by data_size */
+        memmove_extent_buffer(leaf,
+                              btrfs_leaf_data(leaf) + data_end - data_size,
+                              btrfs_leaf_data(leaf) + data_end,
+                              item_end - data_end);
+        /* step 3: record the new size of the item */
+        cur_size = btrfs_item_size_nr(leaf, slot);
+        item = btrfs_item_nr(leaf, slot);
+        btrfs_set_item_size(leaf, item, cur_size + data_size);
+        btrfs_mark_buffer_dirty(leaf);
+        if (btrfs_leaf_free_space(root, leaf) < 0) {
+                btrfs_print_leaf(root, leaf);
+                BUG();
+        }
+        return 0;
+}
+/*
+ * Given a key and some data, insert items into the tree.
+ * This does all the path init required, making room in the tree if needed.
+ *
+ * cpu_key and data_size are parallel arrays of nr entries.  Only a
+ * leading run of them may be inserted: the run is cut short so that it
+ * fits in one leaf, so that it stays below the key already sitting at
+ * the insertion slot, and to a single item when inserting at the end of
+ * a leaf.
+ *
+ * Returns the number of keys that were inserted, -EEXIST if the first
+ * key is already present, or another non-zero error from the search or
+ * from fixup_low_keys().
+ */
+int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
+                            struct btrfs_root *root,
+                            struct btrfs_path *path,
+                            struct btrfs_key *cpu_key, u32 *data_size,
+                            int nr)
+{
+        struct extent_buffer *leaf;
+        struct btrfs_item *item;
+        int ret = 0;
+        int slot;
+        int i;
+        u32 nritems;
+        u32 total_data = 0;
+        u32 total_size = 0;
+        unsigned int data_end;
+        struct btrfs_disk_key disk_key;
+        struct btrfs_key found_key;
+        /*
+         * cap nr at the number of items that can fit in an empty leaf.
+         * nr has to be updated before leaving the loop, otherwise the
+         * cap never takes effect.
+         */
+        for (i = 0; i < nr; i++) {
+                if (total_size + data_size[i] + sizeof(struct btrfs_item) >
+                    BTRFS_LEAF_DATA_SIZE(root)) {
+                        nr = i;
+                        break;
+                }
+                total_data += data_size[i];
+                total_size += data_size[i] + sizeof(struct btrfs_item);
+        }
+        BUG_ON(nr == 0);
+        ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
+        if (ret == 0)
+                return -EEXIST;
+        if (ret < 0)
+                goto out;
+        leaf = path->nodes[0];
+        nritems = btrfs_header_nritems(leaf);
+        data_end = leaf_data_end(root, leaf);
+        if (btrfs_leaf_free_space(root, leaf) < total_size) {
+                /*
+                 * drop items from the tail until the rest fits.  The last
+                 * valid index is nr - 1; starting at nr would read past
+                 * the end of data_size[].
+                 */
+                for (i = nr - 1; i >= 0; i--) {
+                        total_data -= data_size[i];
+                        total_size -= data_size[i] + sizeof(struct btrfs_item);
+                        if (total_size < btrfs_leaf_free_space(root, leaf))
+                                break;
+                }
+                nr = i;
+        }
+        slot = path->slots[0];
+        BUG_ON(slot < 0);
+        if (slot != nritems) {
+                unsigned int old_data = btrfs_item_end_nr(leaf, slot);
+                item = btrfs_item_nr(leaf, slot);
+                btrfs_item_key_to_cpu(leaf, &found_key, slot);
+                /* figure out how many keys we can insert in here */
+                total_data = data_size[0];
+                for (i = 1; i < nr; i++) {
+                        if (comp_cpu_keys(&found_key, cpu_key + i) <= 0)
+                                break;
+                        total_data += data_size[i];
+                }
+                nr = i;
+                if (old_data < data_end) {
+                        btrfs_print_leaf(root, leaf);
+                        printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
+                               slot, old_data, data_end);
+                        BUG_ON(1);
+                }
+                /*
+                 * item0..itemN ... dataN.offset..dataN.size .. data0.size
+                 */
+                /* first correct the data pointers */
+                WARN_ON(leaf->map_token);
+                for (i = slot; i < nritems; i++) {
+                        u32 ioff;
+                        item = btrfs_item_nr(leaf, i);
+                        if (!leaf->map_token) {
+                                map_extent_buffer(leaf, (unsigned long)item,
+                                        sizeof(struct btrfs_item),
+                                        &leaf->map_token, &leaf->kaddr,
+                                        &leaf->map_start, &leaf->map_len,
+                                        KM_USER1);
+                        }
+                        ioff = btrfs_item_offset(leaf, item);
+                        btrfs_set_item_offset(leaf, item, ioff - total_data);
+                }
+                if (leaf->map_token) {
+                        unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
+                        leaf->map_token = NULL;
+                }
+                /* shift the items */
+                memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
+                              btrfs_item_nr_offset(slot),
+                              (nritems - slot) * sizeof(struct btrfs_item));
+                /* shift the data */
+                memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
+                              data_end - total_data, btrfs_leaf_data(leaf) +
+                              data_end, old_data - data_end);
+                data_end = old_data;
+        } else {
+                /*
+                 * this sucks but it has to be done, if we are inserting at
+                 * the end of the leaf only insert 1 of the items, since we
+                 * have no way of knowing whats on the next leaf and we'd have
+                 * to drop our current locks to figure it out
+                 */
+                nr = 1;
+        }
+        /* setup the item for the new data */
+        for (i = 0; i < nr; i++) {
+                btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
+                btrfs_set_item_key(leaf, &disk_key, slot + i);
+                item = btrfs_item_nr(leaf, slot + i);
+                btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
+                data_end -= data_size[i];
+                btrfs_set_item_size(leaf, item, data_size[i]);
+        }
+        btrfs_set_header_nritems(leaf, nritems + nr);
+        btrfs_mark_buffer_dirty(leaf);
+        ret = 0;
+        /* a new first key in the leaf has to be pushed up to the parents */
+        if (slot == 0) {
+                btrfs_cpu_key_to_disk(&disk_key, cpu_key);
+                ret = fixup_low_keys(trans, root, path, &disk_key, 1);
+        }
+        if (btrfs_leaf_free_space(root, leaf) < 0) {
+                btrfs_print_leaf(root, leaf);
+                BUG();
+        }
+out:
+        /* on success report how many items actually went in */
+        if (!ret)
+                ret = nr;
+        return ret;
+}
+/*
+ * Reserve room in the tree for nr new items whose keys are cpu_key[0..nr)
+ * and whose data sizes are data_size[0..nr).  The path is searched and
+ * set up here; on success it points at the first new item.  Only the
+ * item headers are filled in, the data areas are left for the caller.
+ *
+ * Returns 0 on success, -EEXIST if the first key is already in the
+ * tree, or the non-zero result of the search or of fixup_low_keys().
+ */
+int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
+                            struct btrfs_root *root,
+                            struct btrfs_path *path,
+                            struct btrfs_key *cpu_key, u32 *data_size,
+                            int nr)
+{
+        struct btrfs_disk_key disk_key;
+        struct extent_buffer *leaf;
+        struct btrfs_item *item;
+        unsigned int data_end;
+        u32 total_data = 0;
+        u32 total_size;
+        u32 nritems;
+        int slot;
+        int idx;
+        int ret;
+        /* bytes of item data, then data plus one header per item */
+        idx = 0;
+        while (idx < nr) {
+                total_data += data_size[idx];
+                idx++;
+        }
+        total_size = total_data + (nr * sizeof(struct btrfs_item));
+        ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
+        if (ret == 0)
+                return -EEXIST;
+        if (ret < 0)
+                return ret;
+        leaf = path->nodes[0];
+        nritems = btrfs_header_nritems(leaf);
+        data_end = leaf_data_end(root, leaf);
+        if (btrfs_leaf_free_space(root, leaf) < total_size) {
+                btrfs_print_leaf(root, leaf);
+                printk(KERN_CRIT "not enough freespace need %u have %d\n",
+                       total_size, btrfs_leaf_free_space(root, leaf));
+                BUG();
+        }
+        slot = path->slots[0];
+        BUG_ON(slot < 0);
+        if (slot != nritems) {
+                /* inserting in the middle: open a gap for the new items */
+                unsigned int old_data = btrfs_item_end_nr(leaf, slot);
+                if (old_data < data_end) {
+                        btrfs_print_leaf(root, leaf);
+                        printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
+                               slot, old_data, data_end);
+                        BUG_ON(1);
+                }
+                /*
+                 * leaf layout:
+                 * item0..itemN ... dataN.offset..dataN.size .. data0.size
+                 *
+                 * lower the data offsets of the items that will follow
+                 * the new ones
+                 */
+                WARN_ON(leaf->map_token);
+                idx = slot;
+                while (idx < nritems) {
+                        u32 off;
+                        item = btrfs_item_nr(leaf, idx);
+                        if (!leaf->map_token) {
+                                map_extent_buffer(leaf, (unsigned long)item,
+                                        sizeof(struct btrfs_item),
+                                        &leaf->map_token, &leaf->kaddr,
+                                        &leaf->map_start, &leaf->map_len,
+                                        KM_USER1);
+                        }
+                        off = btrfs_item_offset(leaf, item);
+                        btrfs_set_item_offset(leaf, item, off - total_data);
+                        idx++;
+                }
+                if (leaf->map_token) {
+                        unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
+                        leaf->map_token = NULL;
+                }
+                /* move the item headers up by nr slots */
+                memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
+                                      btrfs_item_nr_offset(slot),
+                                      (nritems - slot) *
+                                      sizeof(struct btrfs_item));
+                /* move their data down by total_data bytes */
+                memmove_extent_buffer(leaf,
+                                      btrfs_leaf_data(leaf) + data_end -
+                                      total_data,
+                                      btrfs_leaf_data(leaf) + data_end,
+                                      old_data - data_end);
+                data_end = old_data;
+        }
+        /* fill in key, offset and size of each new item */
+        for (idx = 0; idx < nr; idx++) {
+                btrfs_cpu_key_to_disk(&disk_key, cpu_key + idx);
+                btrfs_set_item_key(leaf, &disk_key, slot + idx);
+                item = btrfs_item_nr(leaf, slot + idx);
+                data_end -= data_size[idx];
+                btrfs_set_item_offset(leaf, item, data_end);
+                btrfs_set_item_size(leaf, item, data_size[idx]);
+        }
+        btrfs_set_header_nritems(leaf, nritems + nr);
+        btrfs_mark_buffer_dirty(leaf);
+        ret = 0;
+        /* a new first key in the leaf has to be pushed up to the parents */
+        if (slot == 0) {
+                btrfs_cpu_key_to_disk(&disk_key, cpu_key);
+                ret = fixup_low_keys(trans, root, path, &disk_key, 1);
+        }
+        if (btrfs_leaf_free_space(root, leaf) < 0) {
+                btrfs_print_leaf(root, leaf);
+                BUG();
+        }
+        return ret;
+}
+/*
+ * Given a key and some data, insert an item into the tree.
+ * This does all the path init required, making room in the tree if needed.
+ *
+ * A temporary path is allocated for the insertion and freed again before
+ * returning.  Returns 0 on success, -ENOMEM if the path could not be
+ * allocated, or the non-zero value handed back by
+ * btrfs_insert_empty_item().
+ */
+int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
+                      *root, struct btrfs_key *cpu_key, void *data, u32
+                      data_size)
+{
+        int ret = 0;
+        struct btrfs_path *path;
+        struct extent_buffer *leaf;
+        unsigned long ptr;
+        path = btrfs_alloc_path();
+        /* nothing has been modified yet, so failing here is recoverable */
+        if (!path)
+                return -ENOMEM;
+        ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
+        if (!ret) {
+                /* the path now points at the empty item; fill in its data */
+                leaf = path->nodes[0];
+                ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+                write_extent_buffer(leaf, data, ptr, data_size);
+                btrfs_mark_buffer_dirty(leaf);
+        }
+        btrfs_free_path(path);
+        return ret;
+}
+/*
+ * remove the key pointer at 'slot' from the node at path->nodes[level].
+ *
+ * the tree should have been previously balanced so the deletion does not
+ * empty a node.  The one tolerated exception is the root node: when its
+ * last pointer goes away it is turned into an (empty) leaf.
+ *
+ * returns 0, or the error from fixup_low_keys() when the first key of the
+ * node changed and the parents had to be updated.
+ */
+static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+                   struct btrfs_path *path, int level, int slot)
+{
+        struct extent_buffer *node = path->nodes[level];
+        u32 count = btrfs_header_nritems(node);
+        int err = 0;
+        /* close the gap, unless the victim is the very last pointer */
+        if (slot != count - 1)
+                memmove_extent_buffer(node,
+                              btrfs_node_key_ptr_offset(slot),
+                              btrfs_node_key_ptr_offset(slot + 1),
+                              sizeof(struct btrfs_key_ptr) *
+                              (count - slot - 1));
+        count--;
+        btrfs_set_header_nritems(node, count);
+        if (count == 0 && node == root->node) {
+                /* an emptied root simply becomes a leaf */
+                BUG_ON(btrfs_header_level(root->node) != 1);
+                btrfs_set_header_level(root->node, 0);
+        } else if (slot == 0) {
+                /* the node has a new first key, propagate it upwards */
+                struct btrfs_disk_key first_key;
+                btrfs_node_key(node, &first_key, 0);
+                err = fixup_low_keys(trans, root, path, &first_key,
+                                     level + 1);
+        }
+        btrfs_mark_buffer_dirty(node);
+        return err;
+}
+/*
+ * helper that drops the leaf referenced by path->nodes[1] at
+ * path->slots[1].  The caller passes the leaf's block pointer in
+ * 'bytenr' because it already has it at hand, which saves reading it
+ * back out of the parent node.
+ *
+ * The pointer is removed from path->nodes[1] first, then the extent
+ * backing the leaf is freed.  Returns zero if both steps worked,
+ * non-zero otherwise.
+ *
+ * The path must already be set up (and balanced) for the deletion, and
+ * path->nodes[1] must be locked.
+ */
+noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
+                            struct btrfs_root *root,
+                            struct btrfs_path *path, u64 bytenr)
+{
+        /* sample the parent generation before the parent is modified */
+        u64 parent_gen = btrfs_header_generation(path->nodes[1]);
+        int err = del_ptr(trans, root, path, 1, path->slots[1]);
+        if (err)
+                return err;
+        return btrfs_free_extent(trans, root, bytenr,
+                                 btrfs_level_size(root, 0),
+                                 path->nodes[1]->start,
+                                 btrfs_header_owner(path->nodes[1]),
+                                 parent_gen, 0, 1);
+}
+/*
+ * delete 'nr' consecutive items, starting at 'slot', from the leaf in
+ * path->nodes[0].  If that empties the leaf, remove it from the tree; if
+ * it leaves the leaf less than a quarter full, try to push what remains
+ * into the neighbouring leaves and delete the leaf if that drains it.
+ *
+ * returns 0 on success, or the error from fixup_low_keys() or from the
+ * push_leaf helpers (-ENOSPC from a push is not treated as an error).
+ * A failing btrfs_del_leaf() trips BUG_ON.
+ */
+int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+                    struct btrfs_path *path, int slot, int nr)
+{
+        struct extent_buffer *leaf;
+        struct btrfs_item *item;
+        int last_off;
+        int dsize = 0;
+        int ret = 0;
+        int wret;
+        int i;
+        u32 nritems;
+        leaf = path->nodes[0];
+        /* data offset of the last item being deleted */
+        last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
+        /* total number of data bytes owned by the deleted items */
+        for (i = 0; i < nr; i++)
+                dsize += btrfs_item_size_nr(leaf, slot + i);
+        nritems = btrfs_header_nritems(leaf);
+        /* items after the deleted range have to be moved into the hole */
+        if (slot + nr != nritems) {
+                int data_end = leaf_data_end(root, leaf);
+                /* shift the data of the trailing items by dsize bytes */
+                memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
+                              data_end + dsize,
+                              btrfs_leaf_data(leaf) + data_end,
+                              last_off - data_end);
+                /* and bump their recorded data offsets by the same amount */
+                for (i = slot + nr; i < nritems; i++) {
+                        u32 ioff;
+                        item = btrfs_item_nr(leaf, i);
+                        /* (re)map the buffer whenever no mapping is held */
+                        if (!leaf->map_token) {
+                                map_extent_buffer(leaf, (unsigned long)item,
+                                        sizeof(struct btrfs_item),
+                                        &leaf->map_token, &leaf->kaddr,
+                                        &leaf->map_start, &leaf->map_len,
+                                        KM_USER1);
+                        }
+                        ioff = btrfs_item_offset(leaf, item);
+                        btrfs_set_item_offset(leaf, item, ioff + dsize);
+                }
+                if (leaf->map_token) {
+                        unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
+                        leaf->map_token = NULL;
+                }
+                /* finally slide the item headers down over the deleted ones */
+                memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
+                              btrfs_item_nr_offset(slot + nr),
+                              sizeof(struct btrfs_item) *
+                              (nritems - slot - nr));
+        }
+        btrfs_set_header_nritems(leaf, nritems - nr);
+        nritems -= nr;
+        /* delete the leaf if we've emptied it */
+        if (nritems == 0) {
+                if (leaf == root->node) {
+                        /* the root itself is never deleted, only reset */
+                        btrfs_set_header_level(leaf, 0);
+                } else {
+                        ret = btrfs_del_leaf(trans, root, path, leaf->start);
+                        BUG_ON(ret);
+                }
+        } else {
+                int used = leaf_space_used(leaf, 0, nritems);
+                /* the first key changed, so the parent keys must follow */
+                if (slot == 0) {
+                        struct btrfs_disk_key disk_key;
+                        btrfs_item_key(leaf, &disk_key, 0);
+                        wret = fixup_low_keys(trans, root, path,
+                                              &disk_key, 1);
+                        if (wret)
+                                ret = wret;
+                }
+                /* delete the leaf if it is mostly empty */
+                if (used < BTRFS_LEAF_DATA_SIZE(root) / 4) {
+                        /* push_leaf_left fixes the path.
+                         * make sure the path still points to our leaf
+                         * for possible call to del_ptr below
+                         */
+                        /* 'slot' is reused to remember the parent slot */
+                        slot = path->slots[1];
+                        /*
+                         * paired with the free_extent_buffer() calls
+                         * below; presumably pins the leaf while the
+                         * pushes may move the path off it
+                         */
+                        extent_buffer_get(leaf);
+                        wret = push_leaf_left(trans, root, path, 1, 1);
+                        if (wret < 0 && wret != -ENOSPC)
+                                ret = wret;
+                        /* only push right if we are still on a non-empty leaf */
+                        if (path->nodes[0] == leaf &&
+                            btrfs_header_nritems(leaf)) {
+                                wret = push_leaf_right(trans, root, path, 1, 1);
+                                if (wret < 0 && wret != -ENOSPC)
+                                        ret = wret;
+                        }
+                        if (btrfs_header_nritems(leaf) == 0) {
+                                /* point the parent slot back at our leaf */
+                                path->slots[1] = slot;
+                                ret = btrfs_del_leaf(trans, root, path,
+                                                     leaf->start);
+                                BUG_ON(ret);
+                                free_extent_buffer(leaf);
+                        } else {
+                                /* if we're still in the path, make sure
+                                 * we're dirty.  Otherwise, one of the
+                                 * push_leaf functions must have already
+                                 * dirtied this buffer
+                                 */
+                                if (path->nodes[0] == leaf)
+                                        btrfs_mark_buffer_dirty(leaf);
+                                free_extent_buffer(leaf);
+                        }
+                } else {
+                        btrfs_mark_buffer_dirty(leaf);
+                }
+        }
+        return ret;
+}
+/*
+ * search the tree again to find a leaf with lesser keys
+ * returns 0 if it found something or 1 if there are no lesser leaves.
+ * returns < 0 on io errors.
+ *
+ * This may release the path, and so you may lose any locks held at the
+ * time you call it.
+ *
+ * The search target is the key immediately preceding the first key of the
+ * current leaf.  When a lower field of that key is already zero we borrow
+ * from the next field up and set the lower fields to their maximum value;
+ * leaving them at zero would aim the search far below the real
+ * predecessor and could skip over leaves in between.
+ */
+int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
+{
+        struct btrfs_key key;
+        struct btrfs_disk_key found_key;
+        int ret;
+        btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
+        if (key.offset > 0) {
+                key.offset--;
+        } else if (key.type > 0) {
+                key.type--;
+                key.offset = (u64)-1;
+        } else if (key.objectid > 0) {
+                key.objectid--;
+                key.type = (u8)-1;
+                key.offset = (u64)-1;
+        } else {
+                /* the first key is all zeros, nothing can sort before it */
+                return 1;
+        }
+        btrfs_release_path(root, path);
+        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+        if (ret < 0)
+                return ret;
+        /* only report success if the leaf we landed on starts below key */
+        btrfs_item_key(path->nodes[0], &found_key, 0);
+        ret = comp_keys(&found_key, &key);
+        if (ret < 0)
+                return 0;
+        return 1;
+}
+/*
+ * A helper function to walk down the tree starting at min_key, and looking
+ * for nodes or leaves that are either in cache or have a minimum
+ * transaction id.  This is used by the btree defrag code, and tree logging
+ *
+ * This does not cow, but it does stuff the starting key it finds back
+ * into min_key, so you can call btrfs_search_slot with cow=1 on the
+ * key and get a writable path.
+ *
+ * This does lock as it descends, and path->keep_locks should be set
+ * to 1 by the caller.
+ *
+ * This honors path->lowest_level to prevent descent past a given level
+ * of the tree.
+ *
+ * min_trans indicates the oldest transaction that you are interested
+ * in walking through.  Any nodes or leaves older than min_trans are
+ * skipped over (without reading them).
+ *
+ * max_key, when not NULL, is only consulted in cache_only mode: the walk
+ * stops with 1 once a node key at or beyond max_key is reached.
+ *
+ * returns zero if something useful was found, < 0 on error and 1 if there
+ * was nothing in the tree that matched the search criteria.
+ */
+int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
+                         struct btrfs_key *max_key,
+                         struct btrfs_path *path, int cache_only,
+                         u64 min_trans)
+{
+        struct extent_buffer *cur;
+        struct btrfs_key found_key;
+        int slot;
+        int sret;
+        u32 nritems;
+        int level;
+        int ret = 1;
+        WARN_ON(!path->keep_locks);
+again:
+        /* every pass starts over from a freshly locked root */
+        cur = btrfs_lock_root_node(root);
+        level = btrfs_header_level(cur);
+        WARN_ON(path->nodes[level]);
+        path->nodes[level] = cur;
+        path->locks[level] = 1;
+        /* the whole tree is older than what the caller asked for */
+        if (btrfs_header_generation(cur) < min_trans) {
+                ret = 1;
+                goto out;
+        }
+        while (1) {
+                nritems = btrfs_header_nritems(cur);
+                level = btrfs_header_level(cur);
+                sret = bin_search(cur, min_key, level, &slot);
+                /* at the lowest level, we're done, setup the path and exit */
+                if (level == path->lowest_level) {
+                        if (slot >= nritems)
+                                goto find_next_key;
+                        ret = 0;
+                        path->slots[level] = slot;
+                        btrfs_item_key_to_cpu(cur, &found_key, slot);
+                        goto out;
+                }
+                /* non-zero sret with slot > 0: step back one pointer */
+                if (sret && slot > 0)
+                        slot--;
+                /*
+                 * check this node pointer against the cache_only and
+                 * min_trans parameters.  If it isn't in cache or is too
+                 * old, skip to the next one.
+                 */
+                while (slot < nritems) {
+                        u64 blockptr;
+                        u64 gen;
+                        struct extent_buffer *tmp;
+                        struct btrfs_disk_key disk_key;
+                        blockptr = btrfs_node_blockptr(cur, slot);
+                        gen = btrfs_node_ptr_generation(cur, slot);
+                        if (gen < min_trans) {
+                                slot++;
+                                continue;
+                        }
+                        /* new enough, and we don't care about the cache */
+                        if (!cache_only)
+                                break;
+                        if (max_key) {
+                                btrfs_node_key(cur, &disk_key, slot);
+                                if (comp_keys(&disk_key, max_key) >= 0) {
+                                        ret = 1;
+                                        goto out;
+                                }
+                        }
+                        /* cache_only: accept only blocks cached and uptodate */
+                        tmp = btrfs_find_tree_block(root, blockptr,
+                                            btrfs_level_size(root, level - 1));
+                        if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
+                                free_extent_buffer(tmp);
+                                break;
+                        }
+                        if (tmp)
+                                free_extent_buffer(tmp);
+                        slot++;
+                }
+find_next_key:
+                /*
+                 * we didn't find a candidate key in this node, walk forward
+                 * and find another one
+                 */
+                if (slot >= nritems) {
+                        path->slots[level] = slot;
+                        sret = btrfs_find_next_key(root, path, min_key, level,
+                                                  cache_only, min_trans);
+                        if (sret == 0) {
+                                /* min_key was advanced; restart from the root */
+                                btrfs_release_path(root, path);
+                                goto again;
+                        } else {
+                                /* ret is still 1 here: nothing matched */
+                                goto out;
+                        }
+                }
+                /* save our key for returning back */
+                btrfs_node_key_to_cpu(cur, &found_key, slot);
+                path->slots[level] = slot;
+                /*
+                 * NOTE(review): this looks unreachable, the lowest_level
+                 * case above always leaves via goto - confirm
+                 */
+                if (level == path->lowest_level) {
+                        ret = 0;
+                        unlock_up(path, level, 1);
+                        goto out;
+                }
+                /*
+                 * descend one level.
+                 * NOTE(review): the result of read_node_slot() is locked
+                 * without a NULL check - confirm it cannot fail here
+                 */
+                cur = read_node_slot(root, cur, slot);
+                btrfs_tree_lock(cur);
+                path->locks[level - 1] = 1;
+                path->nodes[level - 1] = cur;
+                unlock_up(path, level, 1);
+        }
+out:
+        /* hand the key we stopped at back to the caller */
+        if (ret == 0)
+                memcpy(min_key, &found_key, sizeof(found_key));
+        return ret;
+}
+/*
+ * like btrfs_next_leaf, but without any attempt to preserve or fix up
+ * the path.  Starting one slot past the current position at
+ * 'lowest_level', look for the next key that satisfies the cache_only
+ * and min_trans filters, climbing to higher levels of the path whenever
+ * a block is exhausted.  The key that was found is copied into 'key'.
+ *
+ * 0 is returned if another key is found and 1 is returned if there are
+ * no higher keys in the tree.
+ *
+ * path->keep_locks should be set to 1 on the search made before
+ * calling this function.
+ */
+int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
+                        struct btrfs_key *key, int lowest_level,
+                        int cache_only, u64 min_trans)
+{
+        int level;
+        int slot;
+        struct extent_buffer *c;
+        WARN_ON(!path->keep_locks);
+        for (level = lowest_level; level < BTRFS_MAX_LEVEL; level++) {
+                c = path->nodes[level];
+                /* ran off the top of the path */
+                if (!c)
+                        return 1;
+                for (slot = path->slots[level] + 1;
+                     slot < btrfs_header_nritems(c); slot++) {
+                        u64 blockptr;
+                        u64 gen;
+                        /* in a leaf any following item will do */
+                        if (level == 0) {
+                                btrfs_item_key_to_cpu(c, key, slot);
+                                return 0;
+                        }
+                        blockptr = btrfs_node_blockptr(c, slot);
+                        gen = btrfs_node_ptr_generation(c, slot);
+                        if (cache_only) {
+                                /* skip blocks not cached and uptodate */
+                                struct extent_buffer *cur;
+                                int uptodate;
+                                cur = btrfs_find_tree_block(root, blockptr,
+                                            btrfs_level_size(root, level - 1));
+                                if (!cur)
+                                        continue;
+                                uptodate = btrfs_buffer_uptodate(cur, gen);
+                                free_extent_buffer(cur);
+                                if (!uptodate)
+                                        continue;
+                        }
+                        /* skip pointers older than the caller cares about */
+                        if (gen < min_trans)
+                                continue;
+                        btrfs_node_key_to_cpu(c, key, slot);
+                        return 0;
+                }
+        }
+        return 1;
+}
+/*
+ * search the tree again to find a leaf with greater keys
+ * returns 0 if it found something or 1 if there are no greater leaves.
+ * returns < 0 on io errors.
+ *
+ * The path is released and re-searched for the last key of the current
+ * leaf (with keep_locks set for the duration of that search), then walked
+ * up to the first level that has a following slot and back down along the
+ * leftmost pointers to level 0.
+ */
+int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
+{
+        int slot;
+        int level = 1;
+        struct extent_buffer *c;
+        struct extent_buffer *next = NULL;
+        struct btrfs_key key;
+        u32 nritems;
+        int ret;
+        nritems = btrfs_header_nritems(path->nodes[0]);
+        if (nritems == 0)
+                return 1;
+        /* remember the last key of this leaf so we can find our way back */
+        btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
+        btrfs_release_path(root, path);
+        path->keep_locks = 1;
+        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+        path->keep_locks = 0;
+        if (ret < 0)
+                return ret;
+        nritems = btrfs_header_nritems(path->nodes[0]);
+        /*
+         * by releasing the path above we dropped all our locks.  A balance
+         * could have added more items next to the key that used to be
+         * at the very end of the block.  So, check again here and
+         * advance the path if there are now more items available.
+         */
+        if (nritems > 0 && path->slots[0] < nritems - 1) {
+                path->slots[0]++;
+                goto done;
+        }
+        /* climb until some level has a slot after the current one */
+        while (level < BTRFS_MAX_LEVEL) {
+                if (!path->nodes[level])
+                        return 1;
+                slot = path->slots[level] + 1;
+                c = path->nodes[level];
+                if (slot >= btrfs_header_nritems(c)) {
+                        level++;
+                        if (level == BTRFS_MAX_LEVEL)
+                                return 1;
+                        continue;
+                }
+                /*
+                 * NOTE(review): 'next' starts out NULL and the loop breaks
+                 * right after it is assigned, so this branch looks dead
+                 */
+                if (next) {
+                        btrfs_tree_unlock(next);
+                        free_extent_buffer(next);
+                }
+                if (level == 1 && (path->locks[1] || path->skip_locking) &&
+                    path->reada)
+                        reada_for_search(root, path, level, slot, 0);
+                /* NOTE(review): result is used without a NULL check */
+                next = read_node_slot(root, c, slot);
+                if (!path->skip_locking) {
+                        WARN_ON(!btrfs_tree_locked(c));
+                        btrfs_tree_lock(next);
+                }
+                break;
+        }
+        path->slots[level] = slot;
+        /* walk back down, replacing each lower path entry with 'next' */
+        while (1) {
+                level--;
+                c = path->nodes[level];
+                if (path->locks[level])
+                        btrfs_tree_unlock(c);
+                free_extent_buffer(c);
+                path->nodes[level] = next;
+                path->slots[level] = 0;
+                if (!path->skip_locking)
+                        path->locks[level] = 1;
+                if (!level)
+                        break;
+                if (level == 1 && path->locks[1] && path->reada)
+                        reada_for_search(root, path, level, slot, 0);
+                /* always follow the first pointer on the way down */
+                next = read_node_slot(root, next, 0);
+                if (!path->skip_locking) {
+                        WARN_ON(!btrfs_tree_locked(path->nodes[level]));
+                        btrfs_tree_lock(next);
+                }
+        }
+done:
+        unlock_up(path, 0, 1);
+        return 0;
+}
+/*
+ * step backwards through the tree one item at a time, crossing into the
+ * previous leaf through btrfs_prev_leaf when needed.  The walk stops at
+ * the first item whose key type equals 'type', or gives up once it has
+ * moved past min_objectid.
+ *
+ * returns 0 if something is found, 1 if nothing was found and < 0 on error
+ */
+int btrfs_previous_item(struct btrfs_root *root,
+                        struct btrfs_path *path, u64 min_objectid,
+                        int type)
+{
+        struct btrfs_key found_key;
+        struct extent_buffer *leaf;
+        u32 nritems;
+        int ret;
+        for (;;) {
+                if (path->slots[0] != 0) {
+                        path->slots[0]--;
+                } else {
+                        /* at the start of the leaf, go to the one before */
+                        ret = btrfs_prev_leaf(root, path);
+                        if (ret)
+                                return ret;
+                }
+                leaf = path->nodes[0];
+                nritems = btrfs_header_nritems(leaf);
+                if (!nritems)
+                        return 1;
+                /* a slot one past the end means "last item" here */
+                if (path->slots[0] == nritems)
+                        path->slots[0]--;
+                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+                if (found_key.type == type)
+                        return 0;
+                /* gone below the range the caller is interested in */
+                if (found_key.objectid < min_objectid)
+                        return 1;
+                if (found_key.objectid == min_objectid &&
+                    found_key.type < type)
+                        return 1;
+        }
+}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
new file mode 100644
index 000000000000..eee060f88113
--- /dev/null
+++ b/fs/btrfs/ctree.h
@@ -0,0 +1,2129 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#ifndef __BTRFS_CTREE__
+#define __BTRFS_CTREE__
+#include <linux/version.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/fs.h>
+#include <linux/completion.h>
+#include <linux/backing-dev.h>
+#include <linux/wait.h>
+#include <asm/kmap_types.h>
+#include "extent_io.h"
+#include "extent_map.h"
+#include "async-thread.h"
+struct btrfs_trans_handle;
+struct btrfs_transaction;
+extern struct kmem_cache *btrfs_trans_handle_cachep;
+extern struct kmem_cache *btrfs_transaction_cachep;
+extern struct kmem_cache *btrfs_bit_radix_cachep;
+extern struct kmem_cache *btrfs_path_cachep;
+struct btrfs_ordered_sum;
+#define BTRFS_MAGIC "_BHRfS_M"
+#define BTRFS_ACL_NOT_CACHED    ((void *)-1)
+#ifdef CONFIG_LOCKDEP
+# define BTRFS_MAX_LEVEL 7
+#else
+# define BTRFS_MAX_LEVEL 8
+#endif
+/* holds pointers to all of the tree roots */
+#define BTRFS_ROOT_TREE_OBJECTID 1ULL
+/* stores information about which extents are in use, and reference counts */
+#define BTRFS_EXTENT_TREE_OBJECTID 2ULL
+/*
+ * chunk tree stores translations from logical -> physical block numbering
+ * the super block points to the chunk tree
+ */
+#define BTRFS_CHUNK_TREE_OBJECTID 3ULL
+/*
+ * stores information about which areas of a given device are in use.
+ * one per device.  The tree of tree roots points to the device tree
+ */
+#define BTRFS_DEV_TREE_OBJECTID 4ULL
+/* one per subvolume, storing files and directories */
+#define BTRFS_FS_TREE_OBJECTID 5ULL
+/* directory objectid inside the root tree */
+#define BTRFS_ROOT_TREE_DIR_OBJECTID 6ULL
+/* holds checksums of all the data extents */
+#define BTRFS_CSUM_TREE_OBJECTID 7ULL
+/* orphan objectid for tracking unlinked/truncated files */
+#define BTRFS_ORPHAN_OBJECTID -5ULL
+/* does write ahead logging to speed up fsyncs */
+#define BTRFS_TREE_LOG_OBJECTID -6ULL
+#define BTRFS_TREE_LOG_FIXUP_OBJECTID -7ULL
+/* for space balancing */
+#define BTRFS_TREE_RELOC_OBJECTID -8ULL
+#define BTRFS_DATA_RELOC_TREE_OBJECTID -9ULL
+/*
+ * extent checksums all have this objectid
+ * this allows them to share the logging tree
+ * for fsyncs
+ */
+#define BTRFS_EXTENT_CSUM_OBJECTID -10ULL
+/* dummy objectid represents multiple objectids */
+#define BTRFS_MULTIPLE_OBJECTIDS -255ULL
+/*
+ * All files have objectids in this range.
+ */
+#define BTRFS_FIRST_FREE_OBJECTID 256ULL
+#define BTRFS_LAST_FREE_OBJECTID -256ULL
+#define BTRFS_FIRST_CHUNK_TREE_OBJECTID 256ULL
+/*
+ * the device items go into the chunk tree.  The key is in the form
+ * [ 1 BTRFS_DEV_ITEM_KEY device_id ]
+ */
+#define BTRFS_DEV_ITEMS_OBJECTID 1ULL
+/*
+ * we can actually store much bigger names, but lets not confuse the rest
+ * of linux
+ */
+#define BTRFS_NAME_LEN 255
+/* 32 bytes in various csum fields */
+#define BTRFS_CSUM_SIZE 32
+/* csum types */
+#define BTRFS_CSUM_TYPE_CRC32   0
+static int btrfs_csum_sizes[] = { 4, 0 };
+/* four bytes for CRC32 */
+#define BTRFS_EMPTY_DIR_SIZE 0
+#define BTRFS_FT_UNKNOWN        0
+#define BTRFS_FT_REG_FILE       1
+#define BTRFS_FT_DIR            2
+#define BTRFS_FT_CHRDEV         3
+#define BTRFS_FT_BLKDEV         4
+#define BTRFS_FT_FIFO           5
+#define BTRFS_FT_SOCK           6
+#define BTRFS_FT_SYMLINK        7
+#define BTRFS_FT_XATTR          8
+#define BTRFS_FT_MAX            9
+/*
+ * the key defines the order in the tree, and so it also defines (optimal)
+ * block layout.  objectid corresponds to the inode number.  The flags
+ * tells us things about the object, and is a kind of stream selector.
+ * so for a given inode, keys with flags of 1 might refer to the inode
+ * data, flags of 2 may point to file data in the btree and flags == 3
+ * may point to extents.
+ *
+ * NOTE(review): the text above says "flags" but the structs below only
+ * have a 'type' field - presumably that is what is meant.
+ *
+ * offset is the starting byte offset for this key in the stream.
+ *
+ * btrfs_disk_key is in disk byte order.  struct btrfs_key is always
+ * in cpu native order.  Otherwise they are identical and their sizes
+ * should be the same (ie both packed)
+ */
+/* on-disk form of a key: little endian fields, packed */
+struct btrfs_disk_key {
+        __le64 objectid;
+        u8 type;
+        __le64 offset;
+} __attribute__ ((__packed__));
+/* in-memory form of a key: same fields in cpu byte order, packed */
+struct btrfs_key {
+        u64 objectid;
+        u8 type;
+        u64 offset;
+} __attribute__ ((__packed__));
+/*
+ * thin wrapper around an extent_map_tree.
+ * NOTE(review): presumably this holds the logical -> physical chunk
+ * mappings - confirm against the users of this struct
+ */
+struct btrfs_mapping_tree {
+        struct extent_map_tree map_tree;
+};
+#define BTRFS_UUID_SIZE 16
+/*
+ * per-device description item.  All multi-byte fields are little endian
+ * and the struct is packed.
+ */
+struct btrfs_dev_item {
+        /* the internal btrfs device id */
+        __le64 devid;
+        /* size of the device */
+        __le64 total_bytes;
+        /* bytes used */
+        __le64 bytes_used;
+        /* optimal io alignment for this device */
+        __le32 io_align;
+        /* optimal io width for this device */
+        __le32 io_width;
+        /* minimal io size for this device */
+        __le32 sector_size;
+        /* type and info about this device */
+        __le64 type;
+        /* expected generation for this device */
+        __le64 generation;
+        /*
+         * starting byte of this partition on the device,
+         * to allow for stripe alignment in the future
+         */
+        __le64 start_offset;
+        /* grouping information for allocation decisions */
+        __le32 dev_group;
+        /* seek speed 0-100 where 100 is fastest */
+        u8 seek_speed;
+        /* bandwidth 0-100 where 100 is fastest */
+        u8 bandwidth;
+        /* btrfs generated uuid for this device */
+        u8 uuid[BTRFS_UUID_SIZE];
+        /* uuid of FS who owns this device */
+        u8 fsid[BTRFS_UUID_SIZE];
+} __attribute__ ((__packed__));
+/*
+ * one stripe of a chunk (see struct btrfs_chunk): names a device by id
+ * and uuid together with an offset.
+ * NOTE(review): offset is presumably the byte offset on that device
+ */
+struct btrfs_stripe {
+        __le64 devid;
+        __le64 offset;
+        u8 dev_uuid[BTRFS_UUID_SIZE];
+} __attribute__ ((__packed__));
+/*
+ * chunk item.  The struct embeds the first stripe; any further stripes
+ * follow it directly, which is why btrfs_chunk_item_size() adds
+ * (num_stripes - 1) extra stripes to sizeof(struct btrfs_chunk).
+ */
+struct btrfs_chunk {
+        /* size of this chunk in bytes */
+        __le64 length;
+        /* objectid of the root referencing this chunk */
+        __le64 owner;
+        __le64 stripe_len;
+        __le64 type;
+        /* optimal io alignment for this chunk */
+        __le32 io_align;
+        /* optimal io width for this chunk */
+        __le32 io_width;
+        /* minimal io size for this chunk */
+        __le32 sector_size;
+        /* 2^16 stripes is quite a lot, a second limit is the size of a single
+         * item in the btree
+         */
+        __le16 num_stripes;
+        /* sub stripes only matter for raid10 */
+        __le16 sub_stripes;
+        /* the first stripe, always present */
+        struct btrfs_stripe stripe;
+        /* additional stripes go here */
+} __attribute__ ((__packed__));
+/*
+ * number of bytes needed for a chunk item holding num_stripes stripes.
+ * struct btrfs_chunk already contains one stripe, so only the remaining
+ * ones are added on top.  A chunk without stripes is a bug.
+ */
+static inline unsigned long btrfs_chunk_item_size(int num_stripes)
+{
+        unsigned long size = sizeof(struct btrfs_chunk);
+        BUG_ON(num_stripes == 0);
+        size += sizeof(struct btrfs_stripe) * (num_stripes - 1);
+        return size;
+}
+#define BTRFS_FSID_SIZE 16
+#define BTRFS_HEADER_FLAG_WRITTEN (1 << 0)
+/*
+ * every tree block (leaf or node) starts with this header.
+ *
+ * The first four fields are laid out to match the start of
+ * struct btrfs_super_block.  The struct is packed and its multi-byte
+ * fields are little endian.
+ */
+struct btrfs_header {
+        /* these first four must match the super block */
+        u8 csum[BTRFS_CSUM_SIZE];
+        u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
+        __le64 bytenr; /* which block this node is supposed to live in */
+        __le64 flags;
+        /* allowed to be different from the super from here on down */
+        u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
+        __le64 generation;
+        __le64 owner;
+        /* number of items (leaf) or key pointers (node) in this block */
+        __le32 nritems;
+        /* 0 for a leaf, larger for nodes further up the tree */
+        u8 level;
+} __attribute__ ((__packed__));
+#define BTRFS_NODEPTRS_PER_BLOCK(r) (((r)->nodesize - \
+                                      sizeof(struct btrfs_header)) / \
+                                     sizeof(struct btrfs_key_ptr))
+#define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header))
+#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->leafsize))
+#define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
+                                        sizeof(struct btrfs_item) - \
+                                        sizeof(struct btrfs_file_extent_item))
+#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32)
+/*
+ * this is a very generous portion of the super block, giving us
+ * room to translate 14 chunks with 3 stripes each.
+ */
+#define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048
+#define BTRFS_LABEL_SIZE 256
+/*
+ * the super block basically lists the main trees of the FS
+ * it currently lacks any block count etc etc
+ */
+struct btrfs_super_block {
+        u8 csum[BTRFS_CSUM_SIZE];
+        /* the first 4 fields must match struct btrfs_header */
+        u8 fsid[BTRFS_FSID_SIZE];    /* FS specific uuid */
+        __le64 bytenr; /* this block number */
+        __le64 flags;
+        /* allowed to be different from the btrfs_header from here on down */
+        __le64 magic;
+        __le64 generation;
+        __le64 root;
+        __le64 chunk_root;
+        __le64 log_root;
+        /* this will help find the new super based on the log root */
+        __le64 log_root_transid;
+        __le64 total_bytes;
+        __le64 bytes_used;
+        __le64 root_dir_objectid;
+        __le64 num_devices;
+        __le32 sectorsize;
+        __le32 nodesize;
+        __le32 leafsize;
+        __le32 stripesize;
+        __le32 sys_chunk_array_size;
+        __le64 chunk_root_generation;
+        __le64 compat_flags;
+        __le64 compat_ro_flags;
+        __le64 incompat_flags;
+        __le16 csum_type;
+        u8 root_level;
+        u8 chunk_root_level;
+        u8 log_root_level;
+        struct btrfs_dev_item dev_item;
+        char label[BTRFS_LABEL_SIZE];
+        /* future expansion */
+        __le64 reserved[32];
+        u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
+} __attribute__ ((__packed__));
+/*
+ * Compat flags that we support.  If any incompat flags are set other than the
+ * ones specified below then we will fail to mount
+ */
+#define BTRFS_FEATURE_COMPAT_SUPP       0x0
+#define BTRFS_FEATURE_COMPAT_RO_SUPP    0x0
+#define BTRFS_FEATURE_INCOMPAT_SUPP     0x0
+/*
+ * A leaf is full of items. offset and size tell us where to find
+ * the item in the leaf (relative to the start of the data area)
+ */
+struct btrfs_item {
+        struct btrfs_disk_key key;
+        __le32 offset;
+        __le32 size;
+} __attribute__ ((__packed__));
+/*
+ * leaves have an item area and a data area:
+ * [item0, item1....itemN] [free space] [dataN...data1, data0]
+ *
+ * The data is separate from the items to get the keys closer together
+ * during searches.
+ */
+struct btrfs_leaf {
+        struct btrfs_header header;
+        struct btrfs_item items[];
+} __attribute__ ((__packed__));
+/*
+ * all non-leaf blocks are nodes, they hold only keys and pointers to
+ * other blocks
+ */
+struct btrfs_key_ptr {
+        struct btrfs_disk_key key;
+        __le64 blockptr;
+        __le64 generation;
+} __attribute__ ((__packed__));
+struct btrfs_node {
+        struct btrfs_header header;
+        struct btrfs_key_ptr ptrs[];
+} __attribute__ ((__packed__));
+/*
+ * btrfs_paths remember the path taken from the root down to the leaf.
+ * level 0 is always the leaf, and nodes[1...BTRFS_MAX_LEVEL] will point
+ * to any other levels that are present.
+ *
+ * The slots array records the index of the item or block pointer
+ * used while walking the tree.
+ */
+struct btrfs_path {
+        struct extent_buffer *nodes[BTRFS_MAX_LEVEL];
+        int slots[BTRFS_MAX_LEVEL];
+        /* if there is real range locking, this locks field will change */
+        int locks[BTRFS_MAX_LEVEL];
+        int reada;
+        /* keep some upper locks as we walk down */
+        int keep_locks;
+        int skip_locking;
+        int lowest_level;
+        /*
+         * set by btrfs_split_item, tells search_slot to keep all locks
+         * and to force calls to keep space in the nodes
+         */
+        int search_for_split;
+};
+/*
+ * items in the extent btree are used to record the objectid of the
+ * owner of the block and the number of references
+ */
+struct btrfs_extent_item {
+        __le32 refs;
+} __attribute__ ((__packed__));
+struct btrfs_extent_ref {
+        __le64 root;
+        __le64 generation;
+        __le64 objectid;
+        __le32 num_refs;
+} __attribute__ ((__packed__));
+/* dev extents record free space on individual devices.  The owner
+ * field points back to the chunk allocation mapping tree that allocated
+ * the extent.  The chunk tree uuid field is a way to double check the owner
+ */
+struct btrfs_dev_extent {
+        __le64 chunk_tree;
+        __le64 chunk_objectid;
+        __le64 chunk_offset;
+        __le64 length;
+        u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
+} __attribute__ ((__packed__));
+struct btrfs_inode_ref {
+        __le64 index;
+        __le16 name_len;
+        /* name goes here */
+} __attribute__ ((__packed__));
+struct btrfs_timespec {
+        __le64 sec;
+        __le32 nsec;
+} __attribute__ ((__packed__));
+typedef enum {
+        BTRFS_COMPRESS_NONE = 0,
+        BTRFS_COMPRESS_ZLIB = 1,
+        BTRFS_COMPRESS_LAST = 2,
+} btrfs_compression_type;
+/* we don't understand any encryption methods right now */
+typedef enum {
+        BTRFS_ENCRYPTION_NONE = 0,
+        BTRFS_ENCRYPTION_LAST = 1,
+} btrfs_encryption_type;
+struct btrfs_inode_item {
+        /* nfs style generation number */
+        __le64 generation;
+        /* transid that last touched this inode */
+        __le64 transid;
+        __le64 size;
+        __le64 nbytes;
+        __le64 block_group;
+        __le32 nlink;
+        __le32 uid;
+        __le32 gid;
+        __le32 mode;
+        __le64 rdev;
+        __le64 flags;
+        /* modification sequence number for NFS */
+        __le64 sequence;
+        /*
+         * a little future expansion, for more than this we can
+         * just grow the inode item and version it
+         */
+        __le64 reserved[4];
+        struct btrfs_timespec atime;
+        struct btrfs_timespec ctime;
+        struct btrfs_timespec mtime;
+        struct btrfs_timespec otime;
+} __attribute__ ((__packed__));
+struct btrfs_dir_log_item {
+        __le64 end;
+} __attribute__ ((__packed__));
+struct btrfs_dir_item {
+        struct btrfs_disk_key location;
+        __le64 transid;
+        __le16 data_len;
+        __le16 name_len;
+        u8 type;
+} __attribute__ ((__packed__));
+struct btrfs_root_item {
+        struct btrfs_inode_item inode;
+        __le64 generation;
+        __le64 root_dirid;
+        __le64 bytenr;
+        __le64 byte_limit;
+        __le64 bytes_used;
+        __le64 last_snapshot;
+        __le64 flags;
+        __le32 refs;
+        struct btrfs_disk_key drop_progress;
+        u8 drop_level;
+        u8 level;
+} __attribute__ ((__packed__));
+/*
+ * this is used for both forward and backward root refs
+ */
+struct btrfs_root_ref {
+        __le64 dirid;
+        __le64 sequence;
+        __le16 name_len;
+} __attribute__ ((__packed__));
+#define BTRFS_FILE_EXTENT_INLINE 0
+#define BTRFS_FILE_EXTENT_REG 1
+#define BTRFS_FILE_EXTENT_PREALLOC 2
+struct btrfs_file_extent_item {
+        /*
+         * transaction id that created this extent
+         */
+        __le64 generation;
+        /*
+         * max number of bytes to hold this extent in ram
+         * when we split a compressed extent we can't know how big
+         * each of the resulting pieces will be.  So, this is
+         * an upper limit on the size of the extent in ram instead of
+         * an exact limit.
+         */
+        __le64 ram_bytes;
+        /*
+         * 32 bits for the various ways we might encode the data,
+         * including compression and encryption.  If any of these
+         * are set to something a given disk format doesn't understand
+         * it is treated like an incompat flag for reading and writing,
+         * but not for stat.
+         */
+        u8 compression;
+        u8 encryption;
+        __le16 other_encoding; /* spare for later use */
+        /* are we inline data or a real extent? */
+        u8 type;
+        /*
+         * disk space consumed by the extent, checksum blocks are included
+         * in these numbers
+         */
+        __le64 disk_bytenr;
+        __le64 disk_num_bytes;
+        /*
+         * the logical offset in file blocks (no csums)
+         * this extent record is for.  This allows a file extent to point
+         * into the middle of an existing extent on disk, sharing it
+         * between two snapshots (useful if some bytes in the middle of the
+         * extent have changed)
+         */
+        __le64 offset;
+        /*
+         * the logical number of file blocks (no csums included).  This
+         * always reflects the size uncompressed and without encoding.
+         */
+        __le64 num_bytes;
+} __attribute__ ((__packed__));
+struct btrfs_csum_item {
+        u8 csum;
+} __attribute__ ((__packed__));
+/* different types of block groups (and chunks) */
+#define BTRFS_BLOCK_GROUP_DATA     (1 << 0)
+#define BTRFS_BLOCK_GROUP_SYSTEM   (1 << 1)
+#define BTRFS_BLOCK_GROUP_METADATA (1 << 2)
+#define BTRFS_BLOCK_GROUP_RAID0    (1 << 3)
+#define BTRFS_BLOCK_GROUP_RAID1    (1 << 4)
+#define BTRFS_BLOCK_GROUP_DUP      (1 << 5)
+#define BTRFS_BLOCK_GROUP_RAID10   (1 << 6)
+struct btrfs_block_group_item {
+        __le64 used;
+        __le64 chunk_objectid;
+        __le64 flags;
+} __attribute__ ((__packed__));
+struct btrfs_space_info {
+        u64 flags;
+        u64 total_bytes;
+        u64 bytes_used;
+        u64 bytes_pinned;
+        u64 bytes_reserved;
+        u64 bytes_readonly;
+        int full;
+        int force_alloc;
+        struct list_head list;
+        /* for block groups in our same type */
+        struct list_head block_groups;
+        spinlock_t lock;
+        struct rw_semaphore groups_sem;
+};
+struct btrfs_free_space {
+        struct rb_node bytes_index;
+        struct rb_node offset_index;
+        u64 offset;
+        u64 bytes;
+};
+struct btrfs_block_group_cache {
+        struct btrfs_key key;
+        struct btrfs_block_group_item item;
+        spinlock_t lock;
+        struct mutex alloc_mutex;
+        struct mutex cache_mutex;
+        u64 pinned;
+        u64 reserved;
+        u64 flags;
+        int cached;
+        int ro;
+        int dirty;
+        struct btrfs_space_info *space_info;
+        /* free space cache stuff */
+        struct rb_root free_space_bytes;
+        struct rb_root free_space_offset;
+        /* block group cache stuff */
+        struct rb_node cache_node;
+        /* for block groups in the same raid type */
+        struct list_head list;
+        /* usage count */
+        atomic_t count;
+};
+struct btrfs_leaf_ref_tree {
+        struct rb_root root;
+        struct list_head list;
+        spinlock_t lock;
+};
+struct btrfs_device;
+struct btrfs_fs_devices;
+struct btrfs_fs_info {
+        u8 fsid[BTRFS_FSID_SIZE];
+        u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
+        struct btrfs_root *extent_root;
+        struct btrfs_root *tree_root;
+        struct btrfs_root *chunk_root;
+        struct btrfs_root *dev_root;
+        struct btrfs_root *fs_root;
+        struct btrfs_root *csum_root;
+        /* the log root tree is a directory of all the other log roots */
+        struct btrfs_root *log_root_tree;
+        struct radix_tree_root fs_roots_radix;
+        /* block group cache stuff */
+        spinlock_t block_group_cache_lock;
+        struct rb_root block_group_cache_tree;
+        struct extent_io_tree pinned_extents;
+        struct extent_io_tree pending_del;
+        struct extent_io_tree extent_ins;
+        /* logical->physical extent mapping */
+        struct btrfs_mapping_tree mapping_tree;
+        u64 generation;
+        u64 last_trans_committed;
+        u64 last_trans_new_blockgroup;
+        u64 open_ioctl_trans;
+        unsigned long mount_opt;
+        u64 max_extent;
+        u64 max_inline;
+        u64 alloc_start;
+        struct btrfs_transaction *running_transaction;
+        wait_queue_head_t transaction_throttle;
+        wait_queue_head_t transaction_wait;
+        wait_queue_head_t async_submit_wait;
+        wait_queue_head_t tree_log_wait;
+        struct btrfs_super_block super_copy;
+        struct btrfs_super_block super_for_commit;
+        struct block_device *__bdev;
+        struct super_block *sb;
+        struct inode *btree_inode;
+        struct backing_dev_info bdi;
+        spinlock_t hash_lock;
+        struct mutex trans_mutex;
+        struct mutex tree_log_mutex;
+        struct mutex transaction_kthread_mutex;
+        struct mutex cleaner_mutex;
+        struct mutex extent_ins_mutex;
+        struct mutex pinned_mutex;
+        struct mutex chunk_mutex;
+        struct mutex drop_mutex;
+        struct mutex volume_mutex;
+        struct mutex tree_reloc_mutex;
+        struct list_head trans_list;
+        struct list_head hashers;
+        struct list_head dead_roots;
+        atomic_t nr_async_submits;
+        atomic_t async_submit_draining;
+        atomic_t nr_async_bios;
+        atomic_t async_delalloc_pages;
+        atomic_t tree_log_writers;
+        atomic_t tree_log_commit;
+        unsigned long tree_log_batch;
+        u64 tree_log_transid;
+        /*
+         * this is used by the balancing code to wait for all the pending
+         * ordered extents
+         */
+        spinlock_t ordered_extent_lock;
+        struct list_head ordered_extents;
+        struct list_head delalloc_inodes;
+        /*
+         * there is a pool of worker threads for checksumming during writes
+         * and a pool for checksumming after reads.  This is because readers
+         * can run with FS locks held, and the writers may be waiting for
+         * those locks.  We don't want ordering in the pending list to cause
+         * deadlocks, and so the two are serviced separately.
+         *
+         * A third pool does submit_bio to avoid deadlocking with the other
+         * two
+         */
+        struct btrfs_workers workers;
+        struct btrfs_workers delalloc_workers;
+        struct btrfs_workers endio_workers;
+        struct btrfs_workers endio_meta_workers;
+        struct btrfs_workers endio_meta_write_workers;
+        struct btrfs_workers endio_write_workers;
+        struct btrfs_workers submit_workers;
+        /*
+         * fixup workers take dirty pages that didn't properly go through
+         * the cow mechanism and make them safe to write.  It happens
+         * for the sys_munmap function call path
+         */
+        struct btrfs_workers fixup_workers;
+        struct task_struct *transaction_kthread;
+        struct task_struct *cleaner_kthread;
+        int thread_pool_size;
+        /* tree relocation related fields */
+        struct list_head dead_reloc_roots;
+        struct btrfs_leaf_ref_tree reloc_ref_tree;
+        struct btrfs_leaf_ref_tree shared_ref_tree;
+        struct kobject super_kobj;
+        struct completion kobj_unregister;
+        int do_barriers;
+        int closing;
+        int log_root_recovering;
+        atomic_t throttles;
+        atomic_t throttle_gen;
+        u64 total_pinned;
+        struct list_head dirty_cowonly_roots;
+        struct btrfs_fs_devices *fs_devices;
+        struct list_head space_info;
+        spinlock_t delalloc_lock;
+        spinlock_t new_trans_lock;
+        u64 delalloc_bytes;
+        u64 last_alloc;
+        u64 last_data_alloc;
+        spinlock_t ref_cache_lock;
+        u64 total_ref_cache_size;
+        u64 avail_data_alloc_bits;
+        u64 avail_metadata_alloc_bits;
+        u64 avail_system_alloc_bits;
+        u64 data_alloc_profile;
+        u64 metadata_alloc_profile;
+        u64 system_alloc_profile;
+        void *bdev_holder;
+};
+/*
+ * in ram representation of the tree.  extent_root is used for all allocations
+ * and for the extent tree extent_root root.
+ */
+struct btrfs_dirty_root;
+struct btrfs_root {
+        struct extent_buffer *node;
+        /* the node lock is held while changing the node pointer */
+        spinlock_t node_lock;
+        struct extent_buffer *commit_root;
+        struct btrfs_leaf_ref_tree *ref_tree;
+        struct btrfs_leaf_ref_tree ref_tree_struct;
+        struct btrfs_dirty_root *dirty_root;
+        struct btrfs_root *log_root;
+        struct btrfs_root *reloc_root;
+        struct btrfs_root_item root_item;
+        struct btrfs_key root_key;
+        struct btrfs_fs_info *fs_info;
+        struct extent_io_tree dirty_log_pages;
+        struct kobject root_kobj;
+        struct completion kobj_unregister;
+        struct mutex objectid_mutex;
+        struct mutex log_mutex;
+        u64 objectid;
+        u64 last_trans;
+        /* data allocations are done in sectorsize units */
+        u32 sectorsize;
+        /* node allocations are done in nodesize units */
+        u32 nodesize;
+        /* leaf allocations are done in leafsize units */
+        u32 leafsize;
+        u32 stripesize;
+        u32 type;
+        u64 highest_inode;
+        u64 last_inode_alloc;
+        int ref_cows;
+        int track_dirty;
+        u64 defrag_trans_start;
+        struct btrfs_key defrag_progress;
+        struct btrfs_key defrag_max;
+        int defrag_running;
+        int defrag_level;
+        char *name;
+        int in_sysfs;
+        /* the dirty list is only used by non-reference counted roots */
+        struct list_head dirty_list;
+        spinlock_t list_lock;
+        struct list_head dead_list;
+        struct list_head orphan_list;
+        /*
+         * right now this just gets used so that a root has its own devid
+         * for stat.  It may be used for more later
+         */
+        struct super_block anon_super;
+};
+/*
+ * inode items have the data typically returned from stat and store other
+ * info about object characteristics.  There is one for every file and dir in
+ * the FS
+ */
+#define BTRFS_INODE_ITEM_KEY            1
+#define BTRFS_INODE_REF_KEY             12
+#define BTRFS_XATTR_ITEM_KEY            24
+#define BTRFS_ORPHAN_ITEM_KEY           48
+/* reserve 2-15 close to the inode for later flexibility */
+/*
+ * dir items are the name -> inode pointers in a directory.  There is one
+ * for every name in a directory.
+ */
+#define BTRFS_DIR_LOG_ITEM_KEY  60
+#define BTRFS_DIR_LOG_INDEX_KEY 72
+#define BTRFS_DIR_ITEM_KEY      84
+#define BTRFS_DIR_INDEX_KEY     96
+/*
+ * extent data is for file data
+ */
+#define BTRFS_EXTENT_DATA_KEY   108
+/*
+ * extent csums are stored in a separate tree and hold csums for
+ * an entire extent on disk.
+ */
+#define BTRFS_EXTENT_CSUM_KEY   128
+/*
+ * root items point to tree roots.  They are typically in the root
+ * tree used by the super block to find all the other trees
+ */
+#define BTRFS_ROOT_ITEM_KEY     132
+/*
+ * root backrefs tie subvols and snapshots to the directory entries that
+ * reference them
+ */
+#define BTRFS_ROOT_BACKREF_KEY  144
+/*
+ * root refs make a fast index for listing all of the snapshots and
+ * subvolumes referenced by a given root.  They point directly to the
+ * directory item in the root that references the subvol
+ */
+#define BTRFS_ROOT_REF_KEY      156
+/*
+ * extent items are in the extent map tree.  These record which blocks
+ * are used, and how many references there are to each block
+ */
+#define BTRFS_EXTENT_ITEM_KEY   168
+#define BTRFS_EXTENT_REF_KEY    180
+/*
+ * block groups give us hints into the extent allocation trees.  Which
+ * blocks are free etc etc
+ */
+#define BTRFS_BLOCK_GROUP_ITEM_KEY 192
+#define BTRFS_DEV_EXTENT_KEY    204
+#define BTRFS_DEV_ITEM_KEY      216
+#define BTRFS_CHUNK_ITEM_KEY    228
+/*
+ * string items are for debugging.  They just store a short string of
+ * data in the FS
+ */
+#define BTRFS_STRING_ITEM_KEY   253
+#define BTRFS_MOUNT_NODATASUM           (1 << 0)
+#define BTRFS_MOUNT_NODATACOW           (1 << 1)
+#define BTRFS_MOUNT_NOBARRIER           (1 << 2)
+#define BTRFS_MOUNT_SSD                 (1 << 3)
+#define BTRFS_MOUNT_DEGRADED            (1 << 4)
+#define BTRFS_MOUNT_COMPRESS            (1 << 5)
+#define btrfs_clear_opt(o, opt)         ((o) &= ~BTRFS_MOUNT_##opt)
+#define btrfs_set_opt(o, opt)           ((o) |= BTRFS_MOUNT_##opt)
+#define btrfs_test_opt(root, opt)       ((root)->fs_info->mount_opt & \
+                                         BTRFS_MOUNT_##opt)
+/*
+ * Inode flags
+ */
+#define BTRFS_INODE_NODATASUM           (1 << 0)
+#define BTRFS_INODE_NODATACOW           (1 << 1)
+#define BTRFS_INODE_READONLY            (1 << 2)
+#define BTRFS_INODE_NOCOMPRESS          (1 << 3)
+#define BTRFS_INODE_PREALLOC            (1 << 4)
+#define btrfs_clear_flag(inode, flag)   (BTRFS_I(inode)->flags &= \
+                                         ~BTRFS_INODE_##flag)
+#define btrfs_set_flag(inode, flag)     (BTRFS_I(inode)->flags |= \
+                                         BTRFS_INODE_##flag)
+#define btrfs_test_flag(inode, flag)    (BTRFS_I(inode)->flags & \
+                                         BTRFS_INODE_##flag)
+/* some macros to generate set/get funcs for the struct fields.  This
+ * assumes there is a lefoo_to_cpu for every type, so let's make a simple
+ * one for u8:
+ */
+#define le8_to_cpu(v) (v)
+#define cpu_to_le8(v) (v)
+#define __le8 u8
+#define read_eb_member(eb, ptr, type, member, result) (                 \
+        read_extent_buffer(eb, (char *)(result),                        \
+                           ((unsigned long)(ptr)) +                     \
+                            offsetof(type, member),                     \
+                           sizeof(((type *)0)->member)))
+#define write_eb_member(eb, ptr, type, member, result) (                \
+        write_extent_buffer(eb, (char *)(result),                       \
+                           ((unsigned long)(ptr)) +                     \
+                            offsetof(type, member),                     \
+                           sizeof(((type *)0)->member)))
+#ifndef BTRFS_SETGET_FUNCS
+#define BTRFS_SETGET_FUNCS(name, type, member, bits)                    \
+u##bits btrfs_##name(struct extent_buffer *eb, type *s);                \
+void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val);
+#endif
+#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits)             \
+static inline u##bits btrfs_##name(struct extent_buffer *eb)            \
+{                                                                       \
+        type *p = kmap_atomic(eb->first_page, KM_USER0);                \
+        u##bits res = le##bits##_to_cpu(p->member);                     \
+        kunmap_atomic(p, KM_USER0);                                     \
+        return res;                                                     \
+}                                                                       \
+static inline void btrfs_set_##name(struct extent_buffer *eb,           \
+                                    u##bits val)                        \
+{                                                                       \
+        type *p = kmap_atomic(eb->first_page, KM_USER0);                \
+        p->member = cpu_to_le##bits(val);                               \
+        kunmap_atomic(p, KM_USER0);                                     \
+}
+#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits)              \
+static inline u##bits btrfs_##name(type *s)                             \
+{                                                                       \
+        return le##bits##_to_cpu(s->member);                            \
+}                                                                       \
+static inline void btrfs_set_##name(type *s, u##bits val)               \
+{                                                                       \
+        s->member = cpu_to_le##bits(val);                               \
+}
+BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64);
+BTRFS_SETGET_FUNCS(device_total_bytes, struct btrfs_dev_item, total_bytes, 64);
+BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64);
+BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32);
+BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32);
+BTRFS_SETGET_FUNCS(device_start_offset, struct btrfs_dev_item,
+                   start_offset, 64);
+BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32);
+BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64);
+BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32);
+BTRFS_SETGET_FUNCS(device_seek_speed, struct btrfs_dev_item, seek_speed, 8);
+BTRFS_SETGET_FUNCS(device_bandwidth, struct btrfs_dev_item, bandwidth, 8);
+BTRFS_SETGET_FUNCS(device_generation, struct btrfs_dev_item, generation, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item,
+                         total_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_device_bytes_used, struct btrfs_dev_item,
+                         bytes_used, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_device_io_align, struct btrfs_dev_item,
+                         io_align, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_device_io_width, struct btrfs_dev_item,
+                         io_width, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_device_sector_size, struct btrfs_dev_item,
+                         sector_size, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_device_id, struct btrfs_dev_item, devid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_device_group, struct btrfs_dev_item,
+                         dev_group, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_device_seek_speed, struct btrfs_dev_item,
+                         seek_speed, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item,
+                         bandwidth, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_device_generation, struct btrfs_dev_item,
+                         generation, 64);
+static inline char *btrfs_device_uuid(struct btrfs_dev_item *d)
+{
+        return (char *)d + offsetof(struct btrfs_dev_item, uuid);
+}
+static inline char *btrfs_device_fsid(struct btrfs_dev_item *d)
+{
+        return (char *)d + offsetof(struct btrfs_dev_item, fsid);
+}
+BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64);
+BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64);
+BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64);
+BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32);
+BTRFS_SETGET_FUNCS(chunk_io_width, struct btrfs_chunk, io_width, 32);
+BTRFS_SETGET_FUNCS(chunk_sector_size, struct btrfs_chunk, sector_size, 32);
+BTRFS_SETGET_FUNCS(chunk_type, struct btrfs_chunk, type, 64);
+BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16);
+BTRFS_SETGET_FUNCS(chunk_sub_stripes, struct btrfs_chunk, sub_stripes, 16);
+BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64);
+BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64);
+static inline char *btrfs_stripe_dev_uuid(struct btrfs_stripe *s)
+{
+        return (char *)s + offsetof(struct btrfs_stripe, dev_uuid);
+}
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_length, struct btrfs_chunk, length, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len, struct btrfs_chunk,
+                         stripe_len, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_align, struct btrfs_chunk,
+                         io_align, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_width, struct btrfs_chunk,
+                         io_width, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_sector_size, struct btrfs_chunk,
+                         sector_size, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_type, struct btrfs_chunk, type, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_num_stripes, struct btrfs_chunk,
+                         num_stripes, 16);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_sub_stripes, struct btrfs_chunk,
+                         sub_stripes, 16);
+BTRFS_SETGET_STACK_FUNCS(stack_stripe_devid, struct btrfs_stripe, devid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_stripe_offset, struct btrfs_stripe, offset, 64);
+static inline struct btrfs_stripe *btrfs_stripe_nr(struct btrfs_chunk *c,
+                                                   int nr)
+{
+        unsigned long offset = (unsigned long)c;
+        offset += offsetof(struct btrfs_chunk, stripe);
+        offset += nr * sizeof(struct btrfs_stripe);
+        return (struct btrfs_stripe *)offset;
+}
+static inline char *btrfs_stripe_dev_uuid_nr(struct btrfs_chunk *c, int nr)
+{
+        return btrfs_stripe_dev_uuid(btrfs_stripe_nr(c, nr));
+}
+static inline u64 btrfs_stripe_offset_nr(struct extent_buffer *eb,
+                                         struct btrfs_chunk *c, int nr)
+{
+        return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr));
+}
+static inline void btrfs_set_stripe_offset_nr(struct extent_buffer *eb,
+                                             struct btrfs_chunk *c, int nr,
+                                             u64 val)
+{
+        btrfs_set_stripe_offset(eb, btrfs_stripe_nr(c, nr), val);
+}
+static inline u64 btrfs_stripe_devid_nr(struct extent_buffer *eb,
+                                         struct btrfs_chunk *c, int nr)
+{
+        return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr));
+}
+static inline void btrfs_set_stripe_devid_nr(struct extent_buffer *eb,
+                                             struct btrfs_chunk *c, int nr,
+                                             u64 val)
+{
+        btrfs_set_stripe_devid(eb, btrfs_stripe_nr(c, nr), val);
+}
+/* struct btrfs_block_group_item */
+BTRFS_SETGET_STACK_FUNCS(block_group_used, struct btrfs_block_group_item,
+                         used, 64);
+BTRFS_SETGET_FUNCS(disk_block_group_used, struct btrfs_block_group_item,
+                         used, 64);
+BTRFS_SETGET_STACK_FUNCS(block_group_chunk_objectid,
+                        struct btrfs_block_group_item, chunk_objectid, 64);
+BTRFS_SETGET_FUNCS(disk_block_group_chunk_objectid,
+                   struct btrfs_block_group_item, chunk_objectid, 64);
+BTRFS_SETGET_FUNCS(disk_block_group_flags,
+                   struct btrfs_block_group_item, flags, 64);
+BTRFS_SETGET_STACK_FUNCS(block_group_flags,
+                        struct btrfs_block_group_item, flags, 64);
+/* struct btrfs_inode_ref */
+BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);
+BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);
+/*
+ * struct btrfs_inode_item
+ *
+ * nlink, uid, gid and mode are declared with width 32, every other field
+ * listed here with width 64.  The four timestamps are reached through
+ * the btrfs_inode_*time() helpers rather than through these macros.
+ */
+BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64);
+BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64);
+BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64);
+BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64);
+BTRFS_SETGET_FUNCS(inode_nbytes, struct btrfs_inode_item, nbytes, 64);
+BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64);
+BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32);
+BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32);
+BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32);
+BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32);
+BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64);
+BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64);
+/* locate the atime member of @inode_item using integer arithmetic */
+static inline struct btrfs_timespec *
+btrfs_inode_atime(struct btrfs_inode_item *inode_item)
+{
+        unsigned long base = (unsigned long)inode_item;
+
+        return (struct btrfs_timespec *)
+                (base + offsetof(struct btrfs_inode_item, atime));
+}
+/* locate the mtime member of @inode_item using integer arithmetic */
+static inline struct btrfs_timespec *
+btrfs_inode_mtime(struct btrfs_inode_item *inode_item)
+{
+        unsigned long base = (unsigned long)inode_item;
+
+        return (struct btrfs_timespec *)
+                (base + offsetof(struct btrfs_inode_item, mtime));
+}
+/* locate the ctime member of @inode_item using integer arithmetic */
+static inline struct btrfs_timespec *
+btrfs_inode_ctime(struct btrfs_inode_item *inode_item)
+{
+        unsigned long base = (unsigned long)inode_item;
+
+        return (struct btrfs_timespec *)
+                (base + offsetof(struct btrfs_inode_item, ctime));
+}
+/* locate the otime member of @inode_item using integer arithmetic */
+static inline struct btrfs_timespec *
+btrfs_inode_otime(struct btrfs_inode_item *inode_item)
+{
+        unsigned long base = (unsigned long)inode_item;
+
+        return (struct btrfs_timespec *)
+                (base + offsetof(struct btrfs_inode_item, otime));
+}
+/* struct btrfs_timespec: sec is declared with width 64, nsec with width 32 */
+BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64);
+BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
+/*
+ * struct btrfs_dev_extent: accessors for chunk_tree, chunk_objectid,
+ * chunk_offset and length, all declared with width 64.
+ */
+BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent,
+                   chunk_tree, 64);
+BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent,
+                   chunk_objectid, 64);
+BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent,
+                   chunk_offset, 64);
+BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64);
+/* locate the chunk_tree_uuid member of @dev using integer arithmetic */
+static inline u8 *btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev)
+{
+        unsigned long addr = (unsigned long)dev;
+
+        addr += offsetof(struct btrfs_dev_extent, chunk_tree_uuid);
+        return (u8 *)addr;
+}
+/*
+ * struct btrfs_extent_ref
+ *
+ * root, generation, objectid and num_refs each get ref_* accessors from
+ * BTRFS_SETGET_FUNCS and stack_ref_* accessors from
+ * BTRFS_SETGET_STACK_FUNCS.
+ */
+BTRFS_SETGET_FUNCS(ref_root, struct btrfs_extent_ref, root, 64);
+BTRFS_SETGET_FUNCS(ref_generation, struct btrfs_extent_ref, generation, 64);
+BTRFS_SETGET_FUNCS(ref_objectid, struct btrfs_extent_ref, objectid, 64);
+BTRFS_SETGET_FUNCS(ref_num_refs, struct btrfs_extent_ref, num_refs, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_ref_root, struct btrfs_extent_ref, root, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_ref_generation, struct btrfs_extent_ref,
+                         generation, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_ref_objectid, struct btrfs_extent_ref,
+                         objectid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_ref_num_refs, struct btrfs_extent_ref,
+                         num_refs, 32);
+/*
+ * struct btrfs_extent_item: the refs field gets extent_refs accessors
+ * (BTRFS_SETGET_FUNCS) and stack_extent_refs accessors
+ * (BTRFS_SETGET_STACK_FUNCS).
+ */
+BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_extent_refs, struct btrfs_extent_item,
+                         refs, 32);
+/*
+ * struct btrfs_node
+ *
+ * The two accessors below operate on struct btrfs_key_ptr, the element
+ * type the node helpers index with sizeof(struct btrfs_key_ptr) * nr.
+ */
+BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64);
+BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64);
+/*
+ * read the blockptr of key pointer @nr; the key pointer is addressed by
+ * its byte offset inside struct btrfs_node, not by a real pointer
+ */
+static inline u64 btrfs_node_blockptr(struct extent_buffer *eb, int nr)
+{
+        unsigned long slot_off = offsetof(struct btrfs_node, ptrs) +
+                                 sizeof(struct btrfs_key_ptr) * nr;
+
+        return btrfs_key_blockptr(eb, (struct btrfs_key_ptr *)slot_off);
+}
+/*
+ * store @val as the blockptr of key pointer @nr; the key pointer is
+ * addressed by its byte offset inside struct btrfs_node
+ */
+static inline void btrfs_set_node_blockptr(struct extent_buffer *eb,
+                                           int nr, u64 val)
+{
+        unsigned long slot_off = offsetof(struct btrfs_node, ptrs) +
+                                 sizeof(struct btrfs_key_ptr) * nr;
+
+        btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)slot_off, val);
+}
+/*
+ * read the generation of key pointer @nr; the key pointer is addressed
+ * by its byte offset inside struct btrfs_node
+ */
+static inline u64 btrfs_node_ptr_generation(struct extent_buffer *eb, int nr)
+{
+        unsigned long slot_off = offsetof(struct btrfs_node, ptrs) +
+                                 sizeof(struct btrfs_key_ptr) * nr;
+
+        return btrfs_key_generation(eb, (struct btrfs_key_ptr *)slot_off);
+}
+/*
+ * store @val as the generation of key pointer @nr; the key pointer is
+ * addressed by its byte offset inside struct btrfs_node
+ */
+static inline void btrfs_set_node_ptr_generation(struct extent_buffer *eb,
+                                                 int nr, u64 val)
+{
+        unsigned long slot_off = offsetof(struct btrfs_node, ptrs) +
+                                 sizeof(struct btrfs_key_ptr) * nr;
+
+        btrfs_set_key_generation(eb, (struct btrfs_key_ptr *)slot_off, val);
+}
+/* byte offset of key pointer @nr from the start of struct btrfs_node */
+static inline unsigned long btrfs_node_key_ptr_offset(int nr)
+{
+        unsigned long off = offsetof(struct btrfs_node, ptrs);
+
+        off += sizeof(struct btrfs_key_ptr) * nr;
+        return off;
+}
+void btrfs_node_key(struct extent_buffer *eb,
+                    struct btrfs_disk_key *disk_key, int nr);
+/* write @disk_key into the key member of key pointer @nr of @eb */
+static inline void btrfs_set_node_key(struct extent_buffer *eb,
+                                      struct btrfs_disk_key *disk_key, int nr)
+{
+        struct btrfs_key_ptr *kp;
+
+        /* an offset inside the node, carried in a pointer type */
+        kp = (struct btrfs_key_ptr *)btrfs_node_key_ptr_offset(nr);
+        write_eb_member(eb, kp, struct btrfs_key_ptr, key, disk_key);
+}
+/* struct btrfs_item: accessors for offset and size, both declared width 32 */
+BTRFS_SETGET_FUNCS(item_offset, struct btrfs_item, offset, 32);
+BTRFS_SETGET_FUNCS(item_size, struct btrfs_item, size, 32);
+/* byte offset of item header @nr from the start of struct btrfs_leaf */
+static inline unsigned long btrfs_item_nr_offset(int nr)
+{
+        unsigned long off = offsetof(struct btrfs_leaf, items);
+
+        off += sizeof(struct btrfs_item) * nr;
+        return off;
+}
+/*
+ * return the offset of item header @nr cast to a struct btrfs_item
+ * pointer; @eb is not consulted
+ */
+static inline struct btrfs_item *btrfs_item_nr(struct extent_buffer *eb,
+                                               int nr)
+{
+        unsigned long off = btrfs_item_nr_offset(nr);
+
+        return (struct btrfs_item *)off;
+}
+/* sum of the offset and size fields of @item */
+static inline u32 btrfs_item_end(struct extent_buffer *eb,
+                                 struct btrfs_item *item)
+{
+        u32 start = btrfs_item_offset(eb, item);
+        u32 len = btrfs_item_size(eb, item);
+
+        return start + len;
+}
+/* btrfs_item_end() for the item header in slot @nr */
+static inline u32 btrfs_item_end_nr(struct extent_buffer *eb, int nr)
+{
+        struct btrfs_item *item = btrfs_item_nr(eb, nr);
+
+        return btrfs_item_end(eb, item);
+}
+/* offset field of the item header in slot @nr */
+static inline u32 btrfs_item_offset_nr(struct extent_buffer *eb, int nr)
+{
+        struct btrfs_item *item = btrfs_item_nr(eb, nr);
+
+        return btrfs_item_offset(eb, item);
+}
+/* size field of the item header in slot @nr */
+static inline u32 btrfs_item_size_nr(struct extent_buffer *eb, int nr)
+{
+        struct btrfs_item *item = btrfs_item_nr(eb, nr);
+
+        return btrfs_item_size(eb, item);
+}
+/* copy the key member of the item header in slot @nr into @disk_key */
+static inline void btrfs_item_key(struct extent_buffer *eb,
+                           struct btrfs_disk_key *disk_key, int nr)
+{
+        read_eb_member(eb, btrfs_item_nr(eb, nr), struct btrfs_item, key,
+                       disk_key);
+}
+/* write @disk_key into the key member of the item header in slot @nr */
+static inline void btrfs_set_item_key(struct extent_buffer *eb,
+                               struct btrfs_disk_key *disk_key, int nr)
+{
+        write_eb_member(eb, btrfs_item_nr(eb, nr), struct btrfs_item, key,
+                        disk_key);
+}
+/* struct btrfs_dir_log_item */
+BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64);
+/*
+ * struct btrfs_root_ref
+ *
+ * dirid and sequence are declared with width 64, name_len with width 16.
+ */
+BTRFS_SETGET_FUNCS(root_ref_dirid, struct btrfs_root_ref, dirid, 64);
+BTRFS_SETGET_FUNCS(root_ref_sequence, struct btrfs_root_ref, sequence, 64);
+BTRFS_SETGET_FUNCS(root_ref_name_len, struct btrfs_root_ref, name_len, 16);
+/*
+ * struct btrfs_dir_item: accessors for data_len, type, name_len and
+ * transid.  The location member is handled by btrfs_dir_item_key() and
+ * btrfs_set_dir_item_key().
+ */
+BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16);
+BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8);
+BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16);
+BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64);
+/* copy the location member of dir item @item in @eb into @key */
+static inline void btrfs_dir_item_key(struct extent_buffer *eb,
+                                      struct btrfs_dir_item *item,
+                                      struct btrfs_disk_key *key)
+{
+        read_eb_member(eb, item, struct btrfs_dir_item, location, key);
+}
+/* write @key into the location member of dir item @item in @eb */
+static inline void btrfs_set_dir_item_key(struct extent_buffer *eb,
+                                          struct btrfs_dir_item *item,
+                                          struct btrfs_disk_key *key)
+{
+        write_eb_member(eb, item, struct btrfs_dir_item, location, key);
+}
+/*
+ * struct btrfs_disk_key: BTRFS_SETGET_STACK_FUNCS accessors for objectid,
+ * offset (both width 64) and type (width 8).
+ */
+BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key,
+                         objectid, 64);
+BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64);
+BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8);
+/*
+ * fill @cpu from @disk: the two 64-bit members go through le64_to_cpu(),
+ * the single-byte type is copied as is
+ */
+static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu,
+                                         struct btrfs_disk_key *disk)
+{
+        cpu->objectid = le64_to_cpu(disk->objectid);
+        cpu->type = disk->type;
+        cpu->offset = le64_to_cpu(disk->offset);
+}
+/*
+ * fill @disk from @cpu: the two 64-bit members go through cpu_to_le64(),
+ * the single-byte type is copied as is
+ */
+static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk,
+                                         struct btrfs_key *cpu)
+{
+        disk->objectid = cpu_to_le64(cpu->objectid);
+        disk->type = cpu->type;
+        disk->offset = cpu_to_le64(cpu->offset);
+}
+/* fetch node key @nr from @eb and convert it into @key */
+static inline void btrfs_node_key_to_cpu(struct extent_buffer *eb,
+                                  struct btrfs_key *key, int nr)
+{
+        struct btrfs_disk_key raw;
+
+        btrfs_node_key(eb, &raw, nr);
+        btrfs_disk_key_to_cpu(key, &raw);
+}
+/* fetch the key of item @nr from @eb and convert it into @key */
+static inline void btrfs_item_key_to_cpu(struct extent_buffer *eb,
+                                  struct btrfs_key *key, int nr)
+{
+        struct btrfs_disk_key raw;
+
+        btrfs_item_key(eb, &raw, nr);
+        btrfs_disk_key_to_cpu(key, &raw);
+}
+/* fetch the location key of dir item @item and convert it into @key */
+static inline void btrfs_dir_item_key_to_cpu(struct extent_buffer *eb,
+                                      struct btrfs_dir_item *item,
+                                      struct btrfs_key *key)
+{
+        struct btrfs_disk_key raw;
+
+        btrfs_dir_item_key(eb, item, &raw);
+        btrfs_disk_key_to_cpu(key, &raw);
+}
+/* return the type member of @key */
+static inline u8 btrfs_key_type(struct btrfs_key *key)
+{
+        return key->type;
+}
+/* set the type member of @key to @val */
+static inline void btrfs_set_key_type(struct btrfs_key *key, u8 val)
+{
+        key->type = val;
+}
+/*
+ * struct btrfs_header
+ *
+ * The BTRFS_SETGET_HEADER_FUNCS accessors are called with the extent
+ * buffer alone, e.g. btrfs_header_flags(eb) and
+ * btrfs_set_header_flags(eb, val).
+ */
+BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64);
+BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header,
+                          generation, 64);
+BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64);
+BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32);
+BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64);
+BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8);
+/* return 1 when every bit of @flag is set in the header flags, else 0 */
+static inline int btrfs_header_flag(struct extent_buffer *eb, u64 flag)
+{
+        u64 masked = btrfs_header_flags(eb) & flag;
+
+        return masked == flag;
+}
+/*
+ * OR @flag into the header flags; returns 1 when every bit of @flag was
+ * already set beforehand, else 0
+ */
+static inline int btrfs_set_header_flag(struct extent_buffer *eb, u64 flag)
+{
+        u64 before = btrfs_header_flags(eb);
+        int was_set = (before & flag) == flag;
+
+        btrfs_set_header_flags(eb, before | flag);
+        return was_set;
+}
+/*
+ * clear the bits of @flag in the header flags; returns 1 when every bit
+ * of @flag was set beforehand, else 0
+ */
+static inline int btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag)
+{
+        u64 before = btrfs_header_flags(eb);
+        int was_set = (before & flag) == flag;
+
+        btrfs_set_header_flags(eb, before & ~flag);
+        return was_set;
+}
+/*
+ * offset of fsid within struct btrfs_header, returned in a u8 pointer;
+ * @eb is not consulted
+ */
+static inline u8 *btrfs_header_fsid(struct extent_buffer *eb)
+{
+        return (u8 *)(unsigned long)offsetof(struct btrfs_header, fsid);
+}
+/*
+ * offset of chunk_tree_uuid within struct btrfs_header, returned in a u8
+ * pointer; @eb is not consulted
+ */
+static inline u8 *btrfs_header_chunk_tree_uuid(struct extent_buffer *eb)
+{
+        return (u8 *)(unsigned long)offsetof(struct btrfs_header,
+                                             chunk_tree_uuid);
+}
+/*
+ * offset of fsid within struct btrfs_super_block, returned in a u8
+ * pointer; @eb is not consulted
+ */
+static inline u8 *btrfs_super_fsid(struct extent_buffer *eb)
+{
+        return (u8 *)(unsigned long)offsetof(struct btrfs_super_block, fsid);
+}
+/*
+ * offset of csum within struct btrfs_header, returned in a u8 pointer;
+ * @eb is not consulted
+ */
+static inline u8 *btrfs_header_csum(struct extent_buffer *eb)
+{
+        return (u8 *)(unsigned long)offsetof(struct btrfs_header, csum);
+}
+/* always returns NULL; @eb is ignored */
+static inline struct btrfs_node *btrfs_buffer_node(struct extent_buffer *eb)
+{
+        return NULL;
+}
+/* always returns NULL; @eb is ignored */
+static inline struct btrfs_leaf *btrfs_buffer_leaf(struct extent_buffer *eb)
+{
+        return NULL;
+}
+/* always returns NULL; @eb is ignored */
+static inline struct btrfs_header *btrfs_buffer_header(struct extent_buffer *eb)
+{
+        return NULL;
+}
+/* a buffer is treated as a leaf when its header level is 0 */
+static inline int btrfs_is_leaf(struct extent_buffer *eb)
+{
+        return !btrfs_header_level(eb);
+}
+/*
+ * struct btrfs_root_item
+ *
+ * generation, refs, bytenr and level have both disk_root_* accessors
+ * (BTRFS_SETGET_FUNCS) and root_* accessors (BTRFS_SETGET_STACK_FUNCS);
+ * the other fields only have the STACK variant.  root_used and
+ * root_limit map to the bytes_used and byte_limit fields.
+ */
+BTRFS_SETGET_FUNCS(disk_root_generation, struct btrfs_root_item,
+                   generation, 64);
+BTRFS_SETGET_FUNCS(disk_root_refs, struct btrfs_root_item, refs, 32);
+BTRFS_SETGET_FUNCS(disk_root_bytenr, struct btrfs_root_item, bytenr, 64);
+BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8);
+BTRFS_SETGET_STACK_FUNCS(root_generation, struct btrfs_root_item,
+                         generation, 64);
+BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64);
+BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8);
+BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64);
+BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32);
+BTRFS_SETGET_STACK_FUNCS(root_flags, struct btrfs_root_item, flags, 64);
+BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64);
+BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64);
+BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
+                         last_snapshot, 64);
+/*
+ * struct btrfs_super_block
+ *
+ * All accessors here are BTRFS_SETGET_STACK_FUNCS variants.  Two names
+ * differ from their field: super_sys_array_size maps to
+ * sys_chunk_array_size and super_root_dir to root_dir_objectid.
+ */
+BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
+BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64);
+BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
+                         generation, 64);
+BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64);
+BTRFS_SETGET_STACK_FUNCS(super_sys_array_size,
+                         struct btrfs_super_block, sys_chunk_array_size, 32);
+BTRFS_SETGET_STACK_FUNCS(super_chunk_root_generation,
+                         struct btrfs_super_block, chunk_root_generation, 64);
+BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block,
+                         root_level, 8);
+BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block,
+                         chunk_root, 64);
+BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block,
+                         chunk_root_level, 8);
+BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block,
+                         log_root, 64);
+BTRFS_SETGET_STACK_FUNCS(super_log_root_transid, struct btrfs_super_block,
+                         log_root_transid, 64);
+BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block,
+                         log_root_level, 8);
+BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block,
+                         total_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block,
+                         bytes_used, 64);
+BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block,
+                         sectorsize, 32);
+BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block,
+                         nodesize, 32);
+BTRFS_SETGET_STACK_FUNCS(super_leafsize, struct btrfs_super_block,
+                         leafsize, 32);
+BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block,
+                         stripesize, 32);
+BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block,
+                         root_dir_objectid, 64);
+BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block,
+                         num_devices, 64);
+/*
+ * Feature flag words of the super block.  Each accessor has to name its
+ * own field: the compat_ro accessors operate on compat_ro_flags, which
+ * is a separate word from compat_flags.
+ */
+BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block,
+                         compat_flags, 64);
+BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block,
+                         compat_ro_flags, 64);
+BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block,
+                         incompat_flags, 64);
+/* csum_type is used by btrfs_super_csum_size() to index btrfs_csum_sizes */
+BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block,
+                         csum_type, 16);
+/*
+ * look up the checksum size for the csum type stored in @s; a type
+ * outside btrfs_csum_sizes trips BUG_ON
+ */
+static inline int btrfs_super_csum_size(struct btrfs_super_block *s)
+{
+        int csum_type = btrfs_super_csum_type(s);
+
+        BUG_ON(csum_type >= ARRAY_SIZE(btrfs_csum_sizes));
+        return btrfs_csum_sizes[csum_type];
+}
+/* offset of the items array within struct btrfs_leaf; @l is not consulted */
+static inline unsigned long btrfs_leaf_data(struct extent_buffer *l)
+{
+        unsigned long data_start = offsetof(struct btrfs_leaf, items);
+
+        return data_start;
+}
+/* struct btrfs_file_extent_item: the type field is declared with width 8 */
+BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8);
+/*
+ * start of inline data for @e: the position of the disk_bytenr member,
+ * returned as an integer
+ */
+static inline unsigned long
+btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e)
+{
+        unsigned long base = (unsigned long)e;
+
+        return base + offsetof(struct btrfs_file_extent_item, disk_bytenr);
+}
+/*
+ * item size for @datasize bytes of inline data: the part of
+ * struct btrfs_file_extent_item that precedes disk_bytenr plus the data
+ */
+static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize)
+{
+        u32 header = offsetof(struct btrfs_file_extent_item, disk_bytenr);
+
+        return header + datasize;
+}
+/*
+ * remaining struct btrfs_file_extent_item fields: compression and
+ * encryption are declared with width 8, other_encoding with width 16,
+ * everything else with width 64
+ */
+BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item,
+                   disk_bytenr, 64);
+BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item,
+                   generation, 64);
+BTRFS_SETGET_FUNCS(file_extent_disk_num_bytes, struct btrfs_file_extent_item,
+                   disk_num_bytes, 64);
+BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item,
+                  offset, 64);
+BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item,
+                   num_bytes, 64);
+BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item,
+                   ram_bytes, 64);
+BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item,
+                   compression, 8);
+BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item,
+                   encryption, 8);
+BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item,
+                   other_encoding, 16);
+/*
+ * Number of file bytes represented by the inline item; for a compressed
+ * item this is the uncompressed size.  The value is the item's
+ * ram_bytes field narrowed to 32 bits.
+ */
+static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
+                                               struct btrfs_file_extent_item *e)
+{
+        u64 ram_bytes = btrfs_file_extent_ram_bytes(eb, e);
+
+        return (u32)ram_bytes;
+}
+/*
+ * Number of bytes the item occupies on disk without the extent header,
+ * i.e. the item size minus the offset of disk_bytenr.  For a file that
+ * is compressed on disk this is the compressed size.
+ */
+static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
+                                                    struct btrfs_item *e)
+{
+        unsigned long header = offsetof(struct btrfs_file_extent_item,
+                                        disk_bytenr);
+
+        return btrfs_item_size(eb, e) - header;
+}
+/* return sb->s_fs_info as a struct btrfs_root pointer */
+static inline struct btrfs_root *btrfs_sb(struct super_block *sb)
+{
+        return sb->s_fs_info;
+}
+/*
+ * Replace root->name with a NUL-terminated copy of the first @len bytes
+ * of @name.  The old name is freed before the new buffer is allocated,
+ * so when kmalloc() fails root->name is left NULL and -ENOMEM is
+ * returned.  Returns 0 on success.
+ */
+static inline int btrfs_set_root_name(struct btrfs_root *root,
+                                      const char *name, int len)
+{
+        /* if we already have a name just free it */
+        kfree(root->name);
+        root->name = kmalloc(len+1, GFP_KERNEL);
+        if (!root->name)
+                return -ENOMEM;
+        memcpy(root->name, name, len);
+        root->name[len] = '\0';
+        return 0;
+}
+/* block size for a tree level: leafsize at level 0, nodesize above it */
+static inline u32 btrfs_level_size(struct btrfs_root *root, int level)
+{
+        return level == 0 ? root->leafsize : root->nodesize;
+}
+/*
+ * helper function to cast into the data area of the leaf.
+ *
+ * btrfs_item_ptr() yields btrfs_leaf_data(leaf) plus the offset stored
+ * in the item header of @slot, cast to (type *); btrfs_item_ptr_offset()
+ * yields the same sum as an unsigned long.
+ */
+#define btrfs_item_ptr(leaf, slot, type) \
+        ((type *)(btrfs_leaf_data(leaf) + \
+        btrfs_item_offset_nr(leaf, slot)))
+#define btrfs_item_ptr_offset(leaf, slot) \
+        ((unsigned long)(btrfs_leaf_data(leaf) + \
+        btrfs_item_offset_nr(leaf, slot)))
+/* return the dentry stored in @file's f_path */
+static inline struct dentry *fdentry(struct file *file)
+{
+        return file->f_path.dentry;
+}
+/* extent-tree.c */
+int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
+int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
+                            struct btrfs_root *root, u64 bytenr,
+                            u64 num_bytes, u32 *refs);
+int btrfs_update_pinned_extents(struct btrfs_root *root,
+                                u64 bytenr, u64 num, int pin);
+int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
+                        struct btrfs_root *root, struct extent_buffer *leaf);
+int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
+                          struct btrfs_root *root, u64 objectid, u64 bytenr);
+int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
+                         struct btrfs_root *root);
+int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy);
+struct btrfs_block_group_cache *btrfs_lookup_block_group(
+                                                 struct btrfs_fs_info *info,
+                                                 u64 bytenr);
+u64 btrfs_find_block_group(struct btrfs_root *root,
+                           u64 search_start, u64 search_hint, int owner);
+struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
+                                             struct btrfs_root *root,
+                                             u32 blocksize, u64 parent,
+                                             u64 root_objectid,
+                                             u64 ref_generation,
+                                             int level,
+                                             u64 hint,
+                                             u64 empty_size);
+struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
+                                            struct btrfs_root *root,
+                                            u64 bytenr, u32 blocksize);
+int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
+                       struct btrfs_root *root,
+                       u64 num_bytes, u64 parent, u64 min_bytes,
+                       u64 root_objectid, u64 ref_generation,
+                       u64 owner, u64 empty_size, u64 hint_byte,
+                       u64 search_end, struct btrfs_key *ins, u64 data);
+int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
+                                struct btrfs_root *root, u64 parent,
+                                u64 root_objectid, u64 ref_generation,
+                                u64 owner, struct btrfs_key *ins);
+int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
+                                struct btrfs_root *root, u64 parent,
+                                u64 root_objectid, u64 ref_generation,
+                                u64 owner, struct btrfs_key *ins);
+int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
+                                  struct btrfs_root *root,
+                                  u64 num_bytes, u64 min_alloc_size,
+                                  u64 empty_size, u64 hint_byte,
+                                  u64 search_end, struct btrfs_key *ins,
+                                  u64 data);
+int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+                  struct extent_buffer *orig_buf, struct extent_buffer *buf,
+                  u32 *nr_extents);
+int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+                    struct extent_buffer *buf, u32 nr_extents);
+int btrfs_update_ref(struct btrfs_trans_handle *trans,
+                     struct btrfs_root *root, struct extent_buffer *orig_buf,
+                     struct extent_buffer *buf, int start_slot, int nr);
+int btrfs_free_extent(struct btrfs_trans_handle *trans,
+                      struct btrfs_root *root,
+                      u64 bytenr, u64 num_bytes, u64 parent,
+                      u64 root_objectid, u64 ref_generation,
+                      u64 owner_objectid, int pin);
+int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
+int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *root,
+                               struct extent_io_tree *unpin);
+int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
+                         struct btrfs_root *root,
+                         u64 bytenr, u64 num_bytes, u64 parent,
+                         u64 root_objectid, u64 ref_generation,
+                         u64 owner_objectid);
+int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
+                            struct btrfs_root *root, u64 bytenr,
+                            u64 orig_parent, u64 parent,
+                            u64 root_objectid, u64 ref_generation,
+                            u64 owner_objectid);
+int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
+                                    struct btrfs_root *root);
+int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr);
+int btrfs_free_block_groups(struct btrfs_fs_info *info);
+int btrfs_read_block_groups(struct btrfs_root *root);
+int btrfs_make_block_group(struct btrfs_trans_handle *trans,
+                           struct btrfs_root *root, u64 bytes_used,
+                           u64 type, u64 chunk_objectid, u64 chunk_offset,
+                           u64 size);
+int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
+                             struct btrfs_root *root, u64 group_start);
+int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start);
+int btrfs_free_reloc_root(struct btrfs_trans_handle *trans,
+                          struct btrfs_root *root);
+int btrfs_drop_dead_reloc_roots(struct btrfs_root *root);
+int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *root,
+                               struct extent_buffer *buf, u64 orig_start);
+int btrfs_add_dead_reloc_root(struct btrfs_root *root);
+int btrfs_cleanup_reloc_trees(struct btrfs_root *root);
+int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
+u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
+/* ctree.c */
+int btrfs_previous_item(struct btrfs_root *root,
+                        struct btrfs_path *path, u64 min_objectid,
+                        int type);
+int btrfs_merge_path(struct btrfs_trans_handle *trans,
+                     struct btrfs_root *root,
+                     struct btrfs_key *node_keys,
+                     u64 *nodes, int lowest_level);
+int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
+                            struct btrfs_root *root, struct btrfs_path *path,
+                            struct btrfs_key *new_key);
+struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
+struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
+int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
+                        struct btrfs_key *key, int lowest_level,
+                        int cache_only, u64 min_trans);
+int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
+                         struct btrfs_key *max_key,
+                         struct btrfs_path *path, int cache_only,
+                         u64 min_trans);
+int btrfs_cow_block(struct btrfs_trans_handle *trans,
+                    struct btrfs_root *root, struct extent_buffer *buf,
+                    struct extent_buffer *parent, int parent_slot,
+                    struct extent_buffer **cow_ret, u64 prealloc_dest);
+int btrfs_copy_root(struct btrfs_trans_handle *trans,
+                      struct btrfs_root *root,
+                      struct extent_buffer *buf,
+                      struct extent_buffer **cow_ret, u64 new_root_objectid);
+int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root
+                      *root, struct btrfs_path *path, u32 data_size);
+int btrfs_truncate_item(struct btrfs_trans_handle *trans,
+                        struct btrfs_root *root,
+                        struct btrfs_path *path,
+                        u32 new_size, int from_end);
+int btrfs_split_item(struct btrfs_trans_handle *trans,
+                     struct btrfs_root *root,
+                     struct btrfs_path *path,
+                     struct btrfs_key *new_key,
+                     unsigned long split_offset);
+int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
+                      *root, struct btrfs_key *key, struct btrfs_path *p, int
+                      ins_len, int cow);
+int btrfs_realloc_node(struct btrfs_trans_handle *trans,
+                       struct btrfs_root *root, struct extent_buffer *parent,
+                       int start_slot, int cache_only, u64 *last_ret,
+                       struct btrfs_key *progress);
+void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p);
+struct btrfs_path *btrfs_alloc_path(void);
+void btrfs_free_path(struct btrfs_path *p);
+void btrfs_init_path(struct btrfs_path *p);
+int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+                   struct btrfs_path *path, int slot, int nr);
+int btrfs_del_leaf(struct btrfs_trans_handle *trans,
+                            struct btrfs_root *root,
+                            struct btrfs_path *path, u64 bytenr);
+/*
+ * delete the single item at path->slots[0] by calling btrfs_del_items()
+ * with a count of 1
+ */
+static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
+                                 struct btrfs_root *root,
+                                 struct btrfs_path *path)
+{
+        int slot = path->slots[0];
+
+        return btrfs_del_items(trans, root, path, slot, 1);
+}
+int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
+                      *root, struct btrfs_key *key, void *data, u32 data_size);
+int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
+                            struct btrfs_root *root,
+                            struct btrfs_path *path,
+                            struct btrfs_key *cpu_key, u32 *data_size,
+                            int nr);
+int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
+                             struct btrfs_root *root,
+                             struct btrfs_path *path,
+                             struct btrfs_key *cpu_key, u32 *data_size, int nr);
+/*
+ * single-item wrapper around btrfs_insert_empty_items(): passes @key,
+ * the address of the by-value @data_size and a count of 1, and returns
+ * whatever that call returns
+ */
+static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
+                                          struct btrfs_root *root,
+                                          struct btrfs_path *path,
+                                          struct btrfs_key *key,
+                                          u32 data_size)
+{
+        return btrfs_insert_empty_items(trans, root, path, key, &data_size, 1);
+}
+int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
+int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
+int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
+int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
+                        *root);
+int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
+                        struct btrfs_root *root,
+                        struct extent_buffer *node,
+                        struct extent_buffer *parent);
+/* root-item.c */
+int btrfs_find_root_ref(struct btrfs_root *tree_root,
+                   struct btrfs_path *path,
+                   u64 root_id, u64 ref_id);
+int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
+                       struct btrfs_root *tree_root,
+                       u64 root_id, u8 type, u64 ref_id,
+                       u64 dirid, u64 sequence,
+                       const char *name, int name_len);
+int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+                   struct btrfs_key *key);
+int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
+                      *root, struct btrfs_key *key, struct btrfs_root_item
+                      *item);
+int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
+                      *root, struct btrfs_key *key, struct btrfs_root_item
+                      *item);
+int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
+                         btrfs_root_item *item, struct btrfs_key *key);
+int btrfs_search_root(struct btrfs_root *root, u64 search_start,
+                      u64 *found_objectid);
+int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
+                          struct btrfs_root *latest_root);
+/* dir-item.c */
+/*
+ * Insert 'name' into directory objectid 'dir' as a BTRFS_DIR_ITEM_KEY item
+ * keyed by the name hash.  Unless 'root' is the tree root, a second
+ * BTRFS_DIR_INDEX_KEY item keyed by 'index' is inserted as well.
+ */
+int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
+                          struct btrfs_root *root, const char *name,
+                          int name_len, u64 dir,
+                          struct btrfs_key *location, u8 type, u64 index);
+/*
+ * Look up 'name' in directory 'dir' by name hash.  Returns the matching
+ * entry, NULL when there is none, or an ERR_PTR when the tree search fails.
+ * mod < 0 searches with ins_len == -1; any non-zero mod requests cow.
+ */
+struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
+                                             struct btrfs_root *root,
+                                             struct btrfs_path *path, u64 dir,
+                                             const char *name, int name_len,
+                                             int mod);
+/*
+ * Look up a BTRFS_DIR_INDEX_KEY item of 'dir'; 'objectid' is used as the
+ * key offset.  Returns ERR_PTR(-ENOENT) when no item has exactly that key,
+ * and NULL when the item exists but holds no entry called 'name'.
+ */
+struct btrfs_dir_item *
+btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
+                            struct btrfs_root *root,
+                            struct btrfs_path *path, u64 dir,
+                            u64 objectid, const char *name, int name_len,
+                            int mod);
+/*
+ * Walk the entries packed into the item 'path' points at and return the
+ * one whose name equals 'name', or NULL when none does.
+ */
+struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
+                              struct btrfs_path *path,
+                              const char *name, int name_len);
+/*
+ * Remove the entry 'di' from the item 'path' points at.  The whole item is
+ * deleted when 'di' is its only entry; otherwise the item is shrunk.
+ */
+int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
+                              struct btrfs_root *root,
+                              struct btrfs_path *path,
+                              struct btrfs_dir_item *di);
+/*
+ * Insert an xattr entry (name followed by data) for objectid 'dir', keyed
+ * by BTRFS_XATTR_ITEM_KEY and the name hash.
+ */
+int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
+                            struct btrfs_root *root, const char *name,
+                            u16 name_len, const void *data, u16 data_len,
+                            u64 dir);
+/*
+ * Look up the xattr 'name' of objectid 'dir' by name hash.  Returns the
+ * matching entry, NULL when there is none, or an ERR_PTR when the tree
+ * search fails.
+ */
+struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
+                                          struct btrfs_root *root,
+                                          struct btrfs_path *path, u64 dir,
+                                          const char *name, u16 name_len,
+                                          int mod);
+/* orphan.c */
+int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
+                             struct btrfs_root *root, u64 offset);
+int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
+                          struct btrfs_root *root, u64 offset);
+/* inode-map.c */
+int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
+                             struct btrfs_root *fs_root,
+                             u64 dirid, u64 *objectid);
+int btrfs_find_highest_inode(struct btrfs_root *fs_root, u64 *objectid);
+/* inode-item.c */
+int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
+                           struct btrfs_root *root,
+                           const char *name, int name_len,
+                           u64 inode_objectid, u64 ref_objectid, u64 index);
+int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
+                           struct btrfs_root *root,
+                           const char *name, int name_len,
+                           u64 inode_objectid, u64 ref_objectid, u64 *index);
+int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
+                             struct btrfs_root *root,
+                             struct btrfs_path *path, u64 objectid);
+int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
+                       *root, struct btrfs_path *path,
+                       struct btrfs_key *location, int mod);
+/* file-item.c */
+int btrfs_del_csums(struct btrfs_trans_handle *trans,
+                    struct btrfs_root *root, u64 bytenr, u64 len);
+int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
+                          struct bio *bio, u32 *dst);
+int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
+                             struct btrfs_root *root,
+                             u64 objectid, u64 pos,
+                             u64 disk_offset, u64 disk_num_bytes,
+                             u64 num_bytes, u64 offset, u64 ram_bytes,
+                             u8 compression, u8 encryption, u16 other_encoding);
+int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
+                             struct btrfs_root *root,
+                             struct btrfs_path *path, u64 objectid,
+                             u64 bytenr, int mod);
+int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
+                           struct btrfs_root *root,
+                           struct btrfs_ordered_sum *sums);
+int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
+                       struct bio *bio, u64 file_start, int contig);
+int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode,
+                          u64 start, unsigned long len);
+struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
+                                          struct btrfs_root *root,
+                                          struct btrfs_path *path,
+                                          u64 bytenr, int cow);
+int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
+                        struct btrfs_root *root, struct btrfs_path *path,
+                        u64 isize);
+int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start,
+                             u64 end, struct list_head *list);
+/* inode.c */
+/* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */
+#if defined(ClearPageFsMisc) && !defined(ClearPageChecked)
+#define ClearPageChecked ClearPageFsMisc
+#define SetPageChecked SetPageFsMisc
+#define PageChecked PageFsMisc
+#endif
+struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry);
+int btrfs_set_inode_index(struct inode *dir, u64 *index);
+int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
+                       struct btrfs_root *root,
+                       struct inode *dir, struct inode *inode,
+                       const char *name, int name_len);
+int btrfs_add_link(struct btrfs_trans_handle *trans,
+                   struct inode *parent_inode, struct inode *inode,
+                   const char *name, int name_len, int add_backref, u64 index);
+int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *root,
+                               struct inode *inode, u64 new_size,
+                               u32 min_type);
+int btrfs_start_delalloc_inodes(struct btrfs_root *root);
+int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end);
+int btrfs_writepages(struct address_space *mapping,
+                     struct writeback_control *wbc);
+int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
+                             struct btrfs_root *new_root, struct dentry *dentry,
+                             u64 new_dirid, u64 alloc_hint);
+int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
+                         size_t size, struct bio *bio, unsigned long bio_flags);
+unsigned long btrfs_force_ra(struct address_space *mapping,
+                              struct file_ra_state *ra, struct file *file,
+                              pgoff_t offset, pgoff_t last_index);
+int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
+                           int for_del);
+int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page);
+int btrfs_readpage(struct file *file, struct page *page);
+void btrfs_delete_inode(struct inode *inode);
+void btrfs_put_inode(struct inode *inode);
+void btrfs_read_locked_inode(struct inode *inode);
+int btrfs_write_inode(struct inode *inode, int wait);
+void btrfs_dirty_inode(struct inode *inode);
+struct inode *btrfs_alloc_inode(struct super_block *sb);
+void btrfs_destroy_inode(struct inode *inode);
+int btrfs_init_cachep(void);
+void btrfs_destroy_cachep(void);
+long btrfs_ioctl_trans_end(struct file *file);
+struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
+                            struct btrfs_root *root, int wait);
+struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
+                                struct btrfs_root *root);
+struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
+                         struct btrfs_root *root, int *is_new);
+int btrfs_commit_write(struct file *file, struct page *page,
+                       unsigned from, unsigned to);
+struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
+                                    size_t page_offset, u64 start, u64 end,
+                                    int create);
+int btrfs_update_inode(struct btrfs_trans_handle *trans,
+                              struct btrfs_root *root,
+                              struct inode *inode);
+int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
+int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
+void btrfs_orphan_cleanup(struct btrfs_root *root);
+int btrfs_cont_expand(struct inode *inode, loff_t size);
+/* ioctl.c */
+long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+/* file.c */
+int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync);
+int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
+                            int skip_pinned);
+int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
+extern struct file_operations btrfs_file_operations;
+int btrfs_drop_extents(struct btrfs_trans_handle *trans,
+                       struct btrfs_root *root, struct inode *inode,
+                       u64 start, u64 end, u64 inline_limit, u64 *hint_block);
+int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
+                              struct btrfs_root *root,
+                              struct inode *inode, u64 start, u64 end);
+int btrfs_release_file(struct inode *inode, struct file *file);
+/* tree-defrag.c */
+int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
+                        struct btrfs_root *root, int cache_only);
+/* sysfs.c */
+int btrfs_init_sysfs(void);
+void btrfs_exit_sysfs(void);
+int btrfs_sysfs_add_super(struct btrfs_fs_info *fs);
+int btrfs_sysfs_add_root(struct btrfs_root *root);
+void btrfs_sysfs_del_root(struct btrfs_root *root);
+void btrfs_sysfs_del_super(struct btrfs_fs_info *root);
+/* xattr.c */
+ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
+/* super.c */
+u64 btrfs_parse_size(char *str);
+int btrfs_parse_options(struct btrfs_root *root, char *options);
+int btrfs_sync_fs(struct super_block *sb, int wait);
+/* acl.c */
+int btrfs_check_acl(struct inode *inode, int mask);
+int btrfs_init_acl(struct inode *inode, struct inode *dir);
+int btrfs_acl_chmod(struct inode *inode);
+/* free-space-cache.c */
+int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
+                         u64 bytenr, u64 size);
+int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group,
+                              u64 offset, u64 bytes);
+int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
+                            u64 bytenr, u64 size);
+int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group,
+                                 u64 offset, u64 bytes);
+void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
+                                   *block_group);
+struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache
+                                               *block_group, u64 offset,
+                                               u64 bytes);
+void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
+                           u64 bytes);
+u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
+#endif
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
new file mode 100644
index 000000000000..926a0b287a7d
--- /dev/null
+++ b/fs/btrfs/dir-item.c
@@ -0,0 +1,386 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#include "ctree.h"
+#include "disk-io.h"
+#include "hash.h"
+#include "transaction.h"
+/*
+ * Reserve room for one directory entry of 'data_size' bytes under 'cpu_key'.
+ *
+ * A fresh item is inserted when the key is unused.  When the key is already
+ * present (a hash collision) the existing item is grown by 'data_size'
+ * bytes instead, unless it already holds an entry for 'name', in which case
+ * ERR_PTR(-EEXIST) is returned.  Any other negative result of the insert or
+ * extend call is handed back as an ERR_PTR too.
+ *
+ * On success the returned pointer addresses the last 'data_size' bytes of
+ * the item.  Nothing is written there: filling in the header and copying
+ * the name is left to the caller.
+ */
+static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
+                                                   *trans,
+                                                   struct btrfs_root *root,
+                                                   struct btrfs_path *path,
+                                                   struct btrfs_key *cpu_key,
+                                                   u32 data_size,
+                                                   const char *name,
+                                                   int name_len)
+{
+        struct extent_buffer *eb;
+        struct btrfs_item *slot_item;
+        char *tail;
+        int err;
+        err = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
+        if (err == -EEXIST) {
+                /* the key is taken: refuse a duplicate name, else grow */
+                if (btrfs_match_dir_item_name(root, path, name, name_len))
+                        return ERR_PTR(-EEXIST);
+                err = btrfs_extend_item(trans, root, path, data_size);
+                WARN_ON(err > 0);
+        }
+        if (err < 0)
+                return ERR_PTR(err);
+        WARN_ON(err > 0);
+        eb = path->nodes[0];
+        slot_item = btrfs_item_nr(eb, path->slots[0]);
+        tail = btrfs_item_ptr(eb, path->slots[0], char);
+        BUG_ON(data_size > btrfs_item_size(eb, slot_item));
+        /* the new entry occupies the final data_size bytes of the item */
+        tail += btrfs_item_size(eb, slot_item) - data_size;
+        return (struct btrfs_dir_item *)tail;
+}
+/*
+ * xattrs work a lot like directories, this inserts an xattr item
+ * into the tree.
+ *
+ * The item belongs to objectid 'dir' and is keyed by BTRFS_XATTR_ITEM_KEY
+ * and the hash of 'name'.  The entry stores 'name' immediately followed by
+ * 'data'.
+ *
+ * Returns 0 on success, -ENOSPC when name + data do not fit into a single
+ * leaf item, -ENOMEM when no path can be allocated, or the error reported
+ * by insert_with_overflow() (e.g. -EEXIST when the name is already there).
+ */
+int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
+                            struct btrfs_root *root, const char *name,
+                            u16 name_len, const void *data, u16 data_len,
+                            u64 dir)
+{
+        int ret = 0;
+        struct btrfs_path *path;
+        struct btrfs_dir_item *dir_item;
+        unsigned long name_ptr, data_ptr;
+        struct btrfs_key key, location;
+        struct btrfs_disk_key disk_key;
+        struct extent_buffer *leaf;
+        u32 data_size;
+        /*
+         * FIXME: at some point we should handle xattr's that are larger than
+         * what we can fit in our leaf.  Until then reject them, and do so
+         * before the path is allocated so there is nothing to free here.
+         */
+        if (name_len + data_len + sizeof(struct btrfs_dir_item) >
+            BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item))
+                return -ENOSPC;
+        key.objectid = dir;
+        btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
+        key.offset = btrfs_name_hash(name, name_len);
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+        data_size = sizeof(*dir_item) + name_len + data_len;
+        dir_item = insert_with_overflow(trans, root, path, &key, data_size,
+                                        name, name_len);
+        /* hand failures back to the caller instead of crashing the kernel */
+        if (IS_ERR(dir_item)) {
+                ret = PTR_ERR(dir_item);
+                goto out;
+        }
+        /*
+         * We store an all-zero location b/c we aren't pointing at anything
+         * else, that will change if we store the xattr data in a separate
+         * inode.
+         */
+        memset(&location, 0, sizeof(location));
+        leaf = path->nodes[0];
+        btrfs_cpu_key_to_disk(&disk_key, &location);
+        btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
+        btrfs_set_dir_type(leaf, dir_item, BTRFS_FT_XATTR);
+        btrfs_set_dir_name_len(leaf, dir_item, name_len);
+        btrfs_set_dir_transid(leaf, dir_item, trans->transid);
+        btrfs_set_dir_data_len(leaf, dir_item, data_len);
+        /* the name follows the header, the value follows the name */
+        name_ptr = (unsigned long)(dir_item + 1);
+        data_ptr = (unsigned long)((char *)name_ptr + name_len);
+        write_extent_buffer(leaf, name, name_ptr, name_len);
+        write_extent_buffer(leaf, data, data_ptr, data_len);
+        btrfs_mark_buffer_dirty(path->nodes[0]);
+out:
+        btrfs_free_path(path);
+        return ret;
+}
+/*
+ * insert a directory item in the tree, doing all the magic for
+ * both indexes. 'dir' indicates which objectid to insert it into,
+ * 'location' is the key to stuff into the directory item, 'type' is the
+ * type of the inode we're pointing to, and 'index' is the sequence number
+ * to use for the second index (if one is created).
+ *
+ * The first item is keyed by BTRFS_DIR_ITEM_KEY and the hash of the name,
+ * the second one by BTRFS_DIR_INDEX_KEY and 'index'.  The second item is
+ * skipped when 'root' is the tree root.
+ *
+ * Returns 0 on success, -ENOMEM when no path can be allocated, otherwise
+ * the error of the first insert if it failed, else the error of the second.
+ * A first insert failing with -EEXIST still attempts the second insert; for
+ * the tree root that -EEXIST is turned into 0.
+ */
+int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
+                          *root, const char *name, int name_len, u64 dir,
+                          struct btrfs_key *location, u8 type, u64 index)
+{
+        int ret = 0;
+        int ret2 = 0;
+        struct btrfs_path *path;
+        struct btrfs_dir_item *dir_item;
+        struct extent_buffer *leaf;
+        unsigned long name_ptr;
+        struct btrfs_key key;
+        struct btrfs_disk_key disk_key;
+        u32 data_size;
+        key.objectid = dir;
+        btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
+        key.offset = btrfs_name_hash(name, name_len);
+        path = btrfs_alloc_path();
+        /* the path is dereferenced below, so a failed allocation must stop here */
+        if (!path)
+                return -ENOMEM;
+        data_size = sizeof(*dir_item) + name_len;
+        dir_item = insert_with_overflow(trans, root, path, &key, data_size,
+                                        name, name_len);
+        if (IS_ERR(dir_item)) {
+                ret = PTR_ERR(dir_item);
+                /* the name is already hashed in; still try the index item */
+                if (ret == -EEXIST)
+                        goto second_insert;
+                goto out;
+        }
+        leaf = path->nodes[0];
+        btrfs_cpu_key_to_disk(&disk_key, location);
+        btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
+        btrfs_set_dir_type(leaf, dir_item, type);
+        btrfs_set_dir_data_len(leaf, dir_item, 0);
+        btrfs_set_dir_name_len(leaf, dir_item, name_len);
+        btrfs_set_dir_transid(leaf, dir_item, trans->transid);
+        /* the name is stored right behind the entry header */
+        name_ptr = (unsigned long)(dir_item + 1);
+        write_extent_buffer(leaf, name, name_ptr, name_len);
+        btrfs_mark_buffer_dirty(leaf);
+second_insert:
+        /* FIXME, use some real flag for selecting the extra index */
+        if (root == root->fs_info->tree_root) {
+                ret = 0;
+                goto out;
+        }
+        btrfs_release_path(root, path);
+        btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
+        key.offset = index;
+        dir_item = insert_with_overflow(trans, root, path, &key, data_size,
+                                        name, name_len);
+        if (IS_ERR(dir_item)) {
+                ret2 = PTR_ERR(dir_item);
+                goto out;
+        }
+        leaf = path->nodes[0];
+        btrfs_cpu_key_to_disk(&disk_key, location);
+        btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
+        btrfs_set_dir_type(leaf, dir_item, type);
+        btrfs_set_dir_data_len(leaf, dir_item, 0);
+        btrfs_set_dir_name_len(leaf, dir_item, name_len);
+        btrfs_set_dir_transid(leaf, dir_item, trans->transid);
+        name_ptr = (unsigned long)(dir_item + 1);
+        write_extent_buffer(leaf, name, name_ptr, name_len);
+        btrfs_mark_buffer_dirty(leaf);
+out:
+        btrfs_free_path(path);
+        /* an error from the first insert takes precedence over the second */
+        if (ret)
+                return ret;
+        if (ret2)
+                return ret2;
+        return 0;
+}
+/*
+ * Find the directory entry for 'name' in directory objectid 'dir'.
+ *
+ * The search key is (dir, BTRFS_DIR_ITEM_KEY, hash of name).  'mod' says
+ * what the caller intends to do with the result: mod < 0 for deleting the
+ * item (the search runs with ins_len == -1), mod > 0 for changing it; any
+ * non-zero value requests cow.
+ *
+ * Returns the matching entry, NULL when the item or the name is not there,
+ * or an ERR_PTR when the tree search itself fails.
+ */
+struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
+                                             struct btrfs_root *root,
+                                             struct btrfs_path *path, u64 dir,
+                                             const char *name, int name_len,
+                                             int mod)
+{
+        struct extent_buffer *eb;
+        struct btrfs_key want;
+        struct btrfs_key have;
+        int ins_len = 0;
+        int cow = 0;
+        int err;
+        if (mod < 0)
+                ins_len = -1;
+        if (mod != 0)
+                cow = 1;
+        want.objectid = dir;
+        btrfs_set_key_type(&want, BTRFS_DIR_ITEM_KEY);
+        want.offset = btrfs_name_hash(name, name_len);
+        err = btrfs_search_slot(trans, root, &want, path, ins_len, cow);
+        if (err < 0)
+                return ERR_PTR(err);
+        if (err > 0) {
+                /* no exact hit: look at the slot just before, if any */
+                if (path->slots[0] == 0)
+                        return NULL;
+                path->slots[0]--;
+        }
+        eb = path->nodes[0];
+        btrfs_item_key_to_cpu(eb, &have, path->slots[0]);
+        if (have.objectid == dir &&
+            btrfs_key_type(&have) == BTRFS_DIR_ITEM_KEY &&
+            have.offset == want.offset)
+                return btrfs_match_dir_item_name(root, path, name, name_len);
+        return NULL;
+}
+/*
+ * Find a directory entry through the index key.  'dir' is the objectid
+ * being searched and 'objectid' is used as the offset of the
+ * BTRFS_DIR_INDEX_KEY search key.  'mod' has the usual meaning: mod < 0
+ * when the item is going to be deleted, mod > 0 when it is going to be
+ * changed.
+ *
+ * 'name' is checked against the item found so that the index is known to
+ * point at the name the caller was looking for.
+ *
+ * Returns the matching entry, NULL when the item holds no such name,
+ * ERR_PTR(-ENOENT) when no item has exactly this key, or an ERR_PTR when
+ * the tree search fails.
+ */
+struct btrfs_dir_item *
+btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
+                            struct btrfs_root *root,
+                            struct btrfs_path *path, u64 dir,
+                            u64 objectid, const char *name, int name_len,
+                            int mod)
+{
+        struct btrfs_key want;
+        int ins_len = 0;
+        int cow = 0;
+        int err;
+        if (mod < 0)
+                ins_len = -1;
+        if (mod != 0)
+                cow = 1;
+        want.objectid = dir;
+        btrfs_set_key_type(&want, BTRFS_DIR_INDEX_KEY);
+        want.offset = objectid;
+        err = btrfs_search_slot(trans, root, &want, path, ins_len, cow);
+        if (err > 0)
+                return ERR_PTR(-ENOENT);
+        if (err < 0)
+                return ERR_PTR(err);
+        return btrfs_match_dir_item_name(root, path, name, name_len);
+}
+/*
+ * Find the xattr entry called 'name' that belongs to objectid 'dir'.
+ *
+ * The search key is (dir, BTRFS_XATTR_ITEM_KEY, hash of name).  mod < 0
+ * runs the search with ins_len == -1, and any non-zero mod requests cow.
+ *
+ * Returns the matching entry, NULL when the item or the name is not there,
+ * or an ERR_PTR when the tree search itself fails.
+ */
+struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
+                                          struct btrfs_root *root,
+                                          struct btrfs_path *path, u64 dir,
+                                          const char *name, u16 name_len,
+                                          int mod)
+{
+        struct extent_buffer *eb;
+        struct btrfs_key want;
+        struct btrfs_key have;
+        int ins_len = 0;
+        int cow = 0;
+        int err;
+        if (mod < 0)
+                ins_len = -1;
+        if (mod != 0)
+                cow = 1;
+        want.objectid = dir;
+        btrfs_set_key_type(&want, BTRFS_XATTR_ITEM_KEY);
+        want.offset = btrfs_name_hash(name, name_len);
+        err = btrfs_search_slot(trans, root, &want, path, ins_len, cow);
+        if (err < 0)
+                return ERR_PTR(err);
+        if (err > 0) {
+                /* no exact hit: look at the slot just before, if any */
+                if (path->slots[0] == 0)
+                        return NULL;
+                path->slots[0]--;
+        }
+        eb = path->nodes[0];
+        btrfs_item_key_to_cpu(eb, &have, path->slots[0]);
+        if (have.objectid == dir &&
+            btrfs_key_type(&have) == BTRFS_XATTR_ITEM_KEY &&
+            have.offset == want.offset)
+                return btrfs_match_dir_item_name(root, path, name, name_len);
+        return NULL;
+}
+/*
+ * Scan the item 'path' points at (leaf path->nodes[0], slot path->slots[0])
+ * for an entry called 'name'.
+ *
+ * One item can hold several entries back to back; each one is a
+ * struct btrfs_dir_item header followed by its name and its data.  The
+ * first entry whose name has length 'name_len' and the same bytes as
+ * 'name' is returned, or NULL when the whole item has been walked without
+ * a match.
+ */
+struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
+                              struct btrfs_path *path,
+                              const char *name, int name_len)
+{
+        struct extent_buffer *eb = path->nodes[0];
+        struct btrfs_dir_item *entry;
+        u32 item_bytes;
+        u32 entry_bytes;
+        u32 consumed;
+        entry = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dir_item);
+        item_bytes = btrfs_item_size_nr(eb, path->slots[0]);
+        for (consumed = 0; consumed < item_bytes; consumed += entry_bytes) {
+                /* the name starts right behind the entry header */
+                unsigned long entry_name = (unsigned long)(entry + 1);
+                entry_bytes = sizeof(*entry) +
+                        btrfs_dir_name_len(eb, entry) +
+                        btrfs_dir_data_len(eb, entry);
+                if (btrfs_dir_name_len(eb, entry) == name_len &&
+                    memcmp_extent_buffer(eb, name, entry_name, name_len) == 0)
+                        return entry;
+                entry = (struct btrfs_dir_item *)((char *)entry +
+                                                  entry_bytes);
+        }
+        return NULL;
+}
+/*
+ * given a pointer into a directory item, delete it.  This
+ * handles items that have more than one entry in them.
+ *
+ * 'di' must point at an entry inside the item at path->slots[0] of leaf
+ * path->nodes[0].  When that entry is the only one, the whole item is
+ * deleted.  Otherwise the bytes following the entry are moved down over it
+ * and the item is truncated by the size of the entry.
+ *
+ * Returns the result of btrfs_del_item() or btrfs_truncate_item(), so a
+ * failure of either is reported to the caller.
+ */
+int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
+                              struct btrfs_root *root,
+                              struct btrfs_path *path,
+                              struct btrfs_dir_item *di)
+{
+        struct extent_buffer *leaf;
+        u32 sub_item_len;
+        u32 item_len;
+        int ret = 0;
+        leaf = path->nodes[0];
+        /* size of this entry: header + name + data */
+        sub_item_len = sizeof(*di) + btrfs_dir_name_len(leaf, di) +
+                btrfs_dir_data_len(leaf, di);
+        item_len = btrfs_item_size_nr(leaf, path->slots[0]);
+        if (sub_item_len == item_len) {
+                ret = btrfs_del_item(trans, root, path);
+        } else {
+                /* MARKER */
+                unsigned long ptr = (unsigned long)di;
+                unsigned long start;
+                start = btrfs_item_ptr_offset(leaf, path->slots[0]);
+                /* close the gap: shift everything behind the entry down */
+                memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
+                        item_len - (ptr + sub_item_len - start));
+                ret = btrfs_truncate_item(trans, root, path,
+                                          item_len - sub_item_len, 1);
+        }
+        return ret;
+}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
new file mode 100644
index 000000000000..81a313874ae5
--- /dev/null
+++ b/fs/btrfs/disk-io.c
@@ -0,0 +1,2343 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#include <linux/version.h>
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+#include <linux/scatterlist.h>
+#include <linux/swap.h>
+#include <linux/radix-tree.h>
+#include <linux/writeback.h>
+#include <linux/buffer_head.h>
+#include <linux/workqueue.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+#include "compat.h"
+#include "crc32c.h"
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "volumes.h"
+#include "print-tree.h"
+#include "async-thread.h"
+#include "locking.h"
+#include "ref-cache.h"
+#include "tree-log.h"
+static struct extent_io_ops btree_extent_io_ops;
+static void end_workqueue_fn(struct btrfs_work *work);
+/*
+ * end_io_wq structs are used to do processing in task context when an IO is
+ * complete.  This is used during reads to verify checksums, and it is used
+ * by writes to insert metadata for new file extents after IO is complete.
+ *
+ * btrfs_bio_wq_end_io() allocates one per bio, saves the bio's original
+ * completion callback and private pointer in it, and points the bio at
+ * end_workqueue_bio() instead.
+ */
+struct end_io_wq {
+        struct bio *bio; /* the bio being tracked */
+        bio_end_io_t *end_io; /* the bio's original bi_end_io */
+        void *private; /* the bio's original bi_private */
+        struct btrfs_fs_info *info; /* used to reach the endio worker pools */
+        int error; /* err value handed to end_workqueue_bio() */
+        int metadata; /* non-zero selects the "meta" worker pools */
+        struct list_head list;
+        struct btrfs_work work; /* queued with func = end_workqueue_fn */
+};
+/*
+ * async submit bios are used to offload expensive checksumming
+ * onto the worker threads.  They checksum file and metadata bios
+ * just before they are sent down the IO stack.
+ *
+ * One is allocated per bio by btrfs_wq_submit_bio() and freed by
+ * run_one_async_free().
+ */
+struct async_submit_bio {
+        struct inode *inode; /* passed back to both hooks */
+        struct bio *bio; /* the bio being submitted */
+        struct list_head list;
+        extent_submit_bio_hook_t *submit_bio_start; /* called by run_one_async_start() */
+        extent_submit_bio_hook_t *submit_bio_done; /* called by run_one_async_done() */
+        int rw;
+        int mirror_num;
+        unsigned long bio_flags;
+        struct btrfs_work work; /* queued on fs_info->workers */
+};
+/*
+ * extents on the btree inode are pretty simple, there's one extent
+ * that covers the entire device
+ *
+ * Looks up [start, start + len) in the inode's extent map tree.  If no
+ * mapping is cached, one starting at 0 with length (u64)-1 is allocated
+ * and inserted.  page, page_offset and create are not used.
+ *
+ * Returns the extent map, or an ERR_PTR() (-ENOMEM, -EIO or whatever
+ * add_extent_mapping() failed with).
+ *
+ * NOTE(review): on the -EEXIST path where the second lookup by
+ * start/len finds nothing, the result of the lookup by
+ * failed_start/failed_len is replaced by ERR_PTR(-EIO) without being
+ * released - possible reference leak, confirm against
+ * lookup_extent_mapping()'s refcounting.
+ */
+static struct extent_map *btree_get_extent(struct inode *inode,
+                struct page *page, size_t page_offset, u64 start, u64 len,
+                int create)
+{
+        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+        struct extent_map *em;
+        int ret;
+        spin_lock(&em_tree->lock);
+        em = lookup_extent_mapping(em_tree, start, len);
+        if (em) { /* cached: refresh bdev and hand it back */
+                em->bdev =
+                        BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+                spin_unlock(&em_tree->lock);
+                goto out;
+        }
+        spin_unlock(&em_tree->lock);
+        em = alloc_extent_map(GFP_NOFS); /* allocated with the lock dropped */
+        if (!em) {
+                em = ERR_PTR(-ENOMEM);
+                goto out;
+        }
+        em->start = 0;
+        em->len = (u64)-1;
+        em->block_len = (u64)-1;
+        em->block_start = 0;
+        em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+        spin_lock(&em_tree->lock);
+        ret = add_extent_mapping(em_tree, em);
+        if (ret == -EEXIST) { /* a mapping is already there: use it instead */
+                u64 failed_start = em->start;
+                u64 failed_len = em->len;
+                free_extent_map(em);
+                em = lookup_extent_mapping(em_tree, start, len);
+                if (em) {
+                        ret = 0;
+                } else {
+                        em = lookup_extent_mapping(em_tree, failed_start,
+                                                   failed_len);
+                        ret = -EIO;
+                }
+        } else if (ret) {
+                free_extent_map(em);
+                em = NULL;
+        }
+        spin_unlock(&em_tree->lock);
+        if (ret)
+                em = ERR_PTR(ret);
+out:
+        return em;
+}
+/*
+ * fold len bytes at data into the running crc32c value seed and return
+ * the new value.  root is not used.
+ */
+u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len)
+{
+        u32 crc;
+        crc = btrfs_crc32c(seed, data, len);
+        return crc;
+}
+/*
+ * store the bitwise complement of the little-endian form of crc into
+ * the first four bytes of result.
+ */
+void btrfs_csum_final(u32 crc, char *result)
+{
+        __le32 *out = (__le32 *)result;
+        *out = ~cpu_to_le32(crc);
+}
+/*
+ * compute the csum for a btree block, and either verify it or write it
+ * into the csum field of the block.
+ *
+ * The crc covers everything in buf after the first BTRFS_CSUM_SIZE bytes.
+ * With verify set the result is compared against the csum_size bytes at
+ * offset 0 of buf; otherwise it is written there.
+ *
+ * Returns 0 on success and 1 if the buffer could not be mapped, the
+ * result buffer could not be allocated, or verification failed.
+ */
+static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
+                           int verify)
+{
+        u16 csum_size =
+                btrfs_super_csum_size(&root->fs_info->super_copy);
+        char *result = NULL;
+        unsigned long len;
+        unsigned long cur_len;
+        unsigned long offset = BTRFS_CSUM_SIZE;
+        char *map_token = NULL;
+        char *kaddr;
+        unsigned long map_start;
+        unsigned long map_len;
+        int err;
+        int ret = 0;
+        u32 crc = ~(u32)0;
+        unsigned long inline_result;
+        len = buf->len - offset;
+        while (len > 0) {
+                err = map_private_extent_buffer(buf, offset, 32,
+                                        &map_token, &kaddr,
+                                        &map_start, &map_len, KM_USER0);
+                if (err)
+                        return 1;
+                cur_len = min(len, map_len - (offset - map_start));
+                crc = btrfs_csum_data(root, kaddr + offset - map_start,
+                                      crc, cur_len);
+                len -= cur_len;
+                offset += cur_len;
+                unmap_extent_buffer(buf, map_token, KM_USER0);
+        }
+        /* small checksums live on the stack, larger ones are allocated */
+        if (csum_size > sizeof(inline_result)) {
+                result = kzalloc(csum_size, GFP_NOFS);
+                if (!result)
+                        return 1;
+        } else {
+                result = (char *)&inline_result;
+        }
+        btrfs_csum_final(crc, result);
+        if (!verify) {
+                write_extent_buffer(buf, result, 0, csum_size);
+        } else if (memcmp_extent_buffer(buf, result, 0, csum_size)) {
+                /*
+                 * only the leading bytes that fit in a u32 are printed;
+                 * both values start out zeroed so a short csum_size
+                 * cannot leak stack contents into the log
+                 */
+                u32 val = 0;
+                u32 found = 0;
+                size_t show = min_t(size_t, csum_size, sizeof(found));
+                memcpy(&found, result, show);
+                read_extent_buffer(buf, &val, 0, show);
+                /* cast for %llu, as every other printk in this file does */
+                printk(KERN_INFO "btrfs: %s checksum verify failed "
+                       "on %llu wanted %X found %X level %d\n",
+                       root->fs_info->sb->s_id,
+                       (unsigned long long)buf->start, val, found,
+                       btrfs_header_level(buf));
+                ret = 1;
+        }
+        /* single release point for the allocated result buffer */
+        if (result != (char *)&inline_result)
+                kfree(result);
+        return ret;
+}
+/*
+ * a block is only trusted when the generation in its header matches the
+ * transid recorded in the parent's pointer to it; a mismatch means the
+ * block was never written or was written somewhere else.
+ *
+ * A parent_transid of 0 skips the check.  Returns 0 when the block is
+ * acceptable; otherwise logs the mismatch, clears the buffer's uptodate
+ * state and returns 1.
+ */
+static int verify_parent_transid(struct extent_io_tree *io_tree,
+                                 struct extent_buffer *eb, u64 parent_transid)
+{
+        int ret = 0;
+        if (!parent_transid)
+                return 0;
+        if (btrfs_header_generation(eb) == parent_transid)
+                return 0;
+        /* look again with the extent range locked */
+        lock_extent(io_tree, eb->start, eb->start + eb->len - 1, GFP_NOFS);
+        if (!extent_buffer_uptodate(io_tree, eb) ||
+            btrfs_header_generation(eb) != parent_transid) {
+                printk("parent transid verify failed on %llu wanted %llu found %llu\n",
+                       (unsigned long long)eb->start,
+                       (unsigned long long)parent_transid,
+                       (unsigned long long)btrfs_header_generation(eb));
+                ret = 1;
+                clear_extent_buffer_uptodate(io_tree, eb);
+        }
+        unlock_extent(io_tree, eb->start, eb->start + eb->len - 1,
+                      GFP_NOFS);
+        return ret;
+}
+/*
+ * helper to read a given tree block, doing retries as required when
+ * the checksums don't match and we have alternate mirrors to try.
+ *
+ * Each pass reads the buffer from mirror_num (starting at 0) and then
+ * checks the header generation against parent_transid.  The loop ends
+ * when a pass succeeds, when there is only one copy, or when mirror_num
+ * has gone past the number of copies.
+ *
+ * Returns 0 on success.  On failure returns the last read error, or
+ * -EIO when the last read itself worked but the transid check did not,
+ * so a stale block is never reported as a successful read.
+ */
+static int btree_read_extent_buffer_pages(struct btrfs_root *root,
+                                          struct extent_buffer *eb,
+                                          u64 start, u64 parent_transid)
+{
+        struct extent_io_tree *io_tree;
+        int ret;
+        int num_copies = 0;
+        int mirror_num = 0;
+        io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
+        while (1) {
+                ret = read_extent_buffer_pages(io_tree, eb, start, 1,
+                                               btree_get_extent, mirror_num);
+                if (!ret) {
+                        if (!verify_parent_transid(io_tree, eb,
+                                                   parent_transid))
+                                return 0;
+                        /* the read worked but the block is not the one wanted */
+                        ret = -EIO;
+                }
+                num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
+                                              eb->start, eb->len);
+                if (num_copies == 1)
+                        break;
+                mirror_num++;
+                if (mirror_num > num_copies)
+                        break;
+        }
+        return ret;
+}
+/*
+ * checksum a dirty tree block before IO.  The extra checks make sure the
+ * checksum field is only filled in through the first page of a
+ * multi-page block.
+ *
+ * Always returns 0; inconsistencies are reported with WARN_ON and the
+ * block is left unchecksummed.
+ */
+static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
+{
+        struct extent_io_tree *tree = &BTRFS_I(page->mapping->host)->io_tree;
+        u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+        struct extent_buffer *eb;
+        unsigned long len;
+        u64 found_start;
+        int found_level;
+        int ret;
+        /* nothing to do unless page->private encodes a buffer length */
+        if (page->private == EXTENT_PAGE_PRIVATE || !page->private)
+                return 0;
+        len = page->private >> 2;
+        WARN_ON(len == 0);
+        eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
+        ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
+                                             btrfs_header_generation(eb));
+        BUG_ON(ret);
+        found_start = btrfs_header_bytenr(eb);
+        if (found_start != start) {
+                WARN_ON(1);
+        } else if (eb->first_page != page) {
+                WARN_ON(1);
+        } else if (!PageUptodate(page)) {
+                WARN_ON(1);
+        } else {
+                found_level = btrfs_header_level(eb);
+                csum_tree_block(root, eb, 0);
+        }
+        free_extent_buffer(eb);
+        return 0;
+}
+/*
+ * compare the fsid stored in a tree block header against this
+ * filesystem's fs_devices and every entry on its seed chain.
+ *
+ * Returns 0 if one of them matches, 1 otherwise.
+ */
+static int check_tree_block_fsid(struct btrfs_root *root,
+                                 struct extent_buffer *eb)
+{
+        struct btrfs_fs_devices *cur;
+        u8 fsid[BTRFS_UUID_SIZE];
+        read_extent_buffer(eb, fsid, (unsigned long)btrfs_header_fsid(eb),
+                           BTRFS_FSID_SIZE);
+        for (cur = root->fs_info->fs_devices; cur; cur = cur->seed) {
+                if (memcmp(fsid, cur->fsid, BTRFS_FSID_SIZE) == 0)
+                        return 0;
+        }
+        return 1;
+}
+/*
+ * read completion hook for btree pages: check that the block header
+ * carries the expected start, that this page is the buffer's first
+ * page, that the fsid belongs to this filesystem and that the checksum
+ * verifies.
+ *
+ * Returns 0 when the page is skipped or everything checks out, -EIO
+ * otherwise.  state is not used.
+ */
+static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
+                               struct extent_state *state)
+{
+        struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
+        struct extent_io_tree *tree = &BTRFS_I(page->mapping->host)->io_tree;
+        struct extent_buffer *eb;
+        unsigned long len;
+        u64 found_start;
+        int found_level;
+        int ret = 0;
+        /* only pages whose private field encodes a buffer length are checked */
+        if (page->private == EXTENT_PAGE_PRIVATE || !page->private)
+                return 0;
+        len = page->private >> 2;
+        WARN_ON(len == 0);
+        eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
+        found_start = btrfs_header_bytenr(eb);
+        if (found_start != start) {
+                printk(KERN_INFO "btrfs bad tree block start %llu %llu\n",
+                       (unsigned long long)found_start,
+                       (unsigned long long)eb->start);
+                ret = -EIO;
+                goto release;
+        }
+        if (eb->first_page != page) {
+                printk(KERN_INFO "btrfs bad first page %lu %lu\n",
+                       eb->first_page->index, page->index);
+                WARN_ON(1);
+                ret = -EIO;
+                goto release;
+        }
+        if (check_tree_block_fsid(root, eb)) {
+                printk(KERN_INFO "btrfs bad fsid on block %llu\n",
+                       (unsigned long long)eb->start);
+                ret = -EIO;
+                goto release;
+        }
+        found_level = btrfs_header_level(eb);
+        if (csum_tree_block(root, eb, 1))
+                ret = -EIO;
+        end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
+        end = eb->start + end - 1;
+release:
+        free_extent_buffer(eb);
+        return ret;
+}
+/*
+ * bio completion callback installed by btrfs_bio_wq_end_io().  Records
+ * the error and queues end_workqueue_fn on one of four worker pools,
+ * picked by bio direction and by whether the bio is metadata.
+ */
+static void end_workqueue_bio(struct bio *bio, int err)
+{
+        struct end_io_wq *wq = bio->bi_private;
+        struct btrfs_fs_info *fs_info = wq->info;
+        int is_write = (bio->bi_rw & (1 << BIO_RW)) != 0;
+        wq->error = err;
+        wq->work.func = end_workqueue_fn;
+        wq->work.flags = 0;
+        if (is_write && wq->metadata)
+                btrfs_queue_worker(&fs_info->endio_meta_write_workers,
+                                   &wq->work);
+        else if (is_write)
+                btrfs_queue_worker(&fs_info->endio_write_workers,
+                                   &wq->work);
+        else if (wq->metadata)
+                btrfs_queue_worker(&fs_info->endio_meta_workers,
+                                   &wq->work);
+        else
+                btrfs_queue_worker(&fs_info->endio_workers,
+                                   &wq->work);
+}
+/*
+ * arrange for a bio's completion to be handled through the end_io
+ * worker pools.  The bio's current bi_end_io and bi_private are saved
+ * in a new end_io_wq and replaced by end_workqueue_bio and that struct.
+ *
+ * Returns 0, or -ENOMEM if the end_io_wq cannot be allocated.
+ */
+int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
+                        int metadata)
+{
+        struct end_io_wq *wq = kmalloc(sizeof(*wq), GFP_NOFS);
+        if (!wq)
+                return -ENOMEM;
+        /* remember the bio's own completion state */
+        wq->bio = bio;
+        wq->end_io = bio->bi_end_io;
+        wq->private = bio->bi_private;
+        wq->info = info;
+        wq->metadata = metadata;
+        wq->error = 0;
+        /* and take over completion */
+        bio->bi_end_io = end_workqueue_bio;
+        bio->bi_private = wq;
+        return 0;
+}
+/*
+ * 256 times the smaller of the worker pool's max_workers and the number
+ * of open devices.
+ */
+unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
+{
+        unsigned long workers = info->workers.max_workers;
+        unsigned long devices = info->fs_devices->open_devices;
+        return 256 * min_t(unsigned long, workers, devices);
+}
+/*
+ * returns 1 when nr_async_bios is above the async submit limit, 0
+ * otherwise.  iodone is not used.
+ */
+int btrfs_congested_async(struct btrfs_fs_info *info, int iodone)
+{
+        unsigned long limit = btrfs_async_submit_limit(info);
+        if (atomic_read(&info->nr_async_bios) > limit)
+                return 1;
+        return 0;
+}
+/*
+ * work function for an async_submit_bio: invoke its submit_bio_start
+ * hook with the saved arguments.
+ */
+static void run_one_async_start(struct btrfs_work *work)
+{
+        struct async_submit_bio *async =
+                container_of(work, struct async_submit_bio, work);
+        struct btrfs_fs_info *fs_info;
+        fs_info = BTRFS_I(async->inode)->root->fs_info;
+        async->submit_bio_start(async->inode, async->rw, async->bio,
+                               async->mirror_num, async->bio_flags);
+}
+/*
+ * ordered function for an async_submit_bio: drop the pending-submit
+ * count, wake any waiter once the count is below two thirds of the
+ * async submit limit, then invoke the submit_bio_done hook.
+ */
+static void run_one_async_done(struct btrfs_work *work)
+{
+        struct async_submit_bio *async =
+                container_of(work, struct async_submit_bio, work);
+        struct btrfs_fs_info *fs_info = BTRFS_I(async->inode)->root->fs_info;
+        int limit = btrfs_async_submit_limit(fs_info);
+        int wake_below = limit * 2 / 3;
+        atomic_dec(&fs_info->nr_async_submits);
+        if (atomic_read(&fs_info->nr_async_submits) < wake_below &&
+            waitqueue_active(&fs_info->async_submit_wait))
+                wake_up(&fs_info->async_submit_wait);
+        async->submit_bio_done(async->inode, async->rw, async->bio,
+                               async->mirror_num, async->bio_flags);
+}
+/* ordered_free callback: release the async_submit_bio that embeds work */
+static void run_one_async_free(struct btrfs_work *work)
+{
+        kfree(container_of(work, struct async_submit_bio, work));
+}
+/*
+ * hand a bio to the worker threads.  submit_bio_start is run from the
+ * work item's func and submit_bio_done from its ordered_func; the
+ * tracking struct is released from ordered_free.
+ *
+ * Returns 0, or -ENOMEM if the tracking struct cannot be allocated.
+ */
+int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
+                        int rw, struct bio *bio, int mirror_num,
+                        unsigned long bio_flags,
+                        extent_submit_bio_hook_t *submit_bio_start,
+                        extent_submit_bio_hook_t *submit_bio_done)
+{
+        struct async_submit_bio *async = kmalloc(sizeof(*async), GFP_NOFS);
+        if (!async)
+                return -ENOMEM;
+        /* what to submit */
+        async->inode = inode;
+        async->bio = bio;
+        async->rw = rw;
+        async->mirror_num = mirror_num;
+        async->bio_flags = bio_flags;
+        /* how to submit it */
+        async->submit_bio_start = submit_bio_start;
+        async->submit_bio_done = submit_bio_done;
+        /* work item wiring */
+        async->work.flags = 0;
+        async->work.func = run_one_async_start;
+        async->work.ordered_func = run_one_async_done;
+        async->work.ordered_free = run_one_async_free;
+        atomic_inc(&fs_info->nr_async_submits);
+        btrfs_queue_worker(&fs_info->workers, &async->work);
+#if 0
+        int limit = btrfs_async_submit_limit(fs_info);
+        if (atomic_read(&fs_info->nr_async_submits) > limit) {
+                wait_event_timeout(fs_info->async_submit_wait,
+                           (atomic_read(&fs_info->nr_async_submits) < limit),
+                           HZ/10);
+                wait_event_timeout(fs_info->async_submit_wait,
+                           (atomic_read(&fs_info->nr_async_bios) < limit),
+                           HZ/10);
+        }
+#endif
+        /* while a drain is flagged, wait for the pending count to hit zero */
+        while (atomic_read(&fs_info->async_submit_draining) &&
+              atomic_read(&fs_info->nr_async_submits)) {
+                wait_event(fs_info->async_submit_wait,
+                           (atomic_read(&fs_info->nr_async_submits) == 0));
+        }
+        return 0;
+}
+/*
+ * run csum_dirty_buffer() on every page attached to the bio.  Always
+ * returns 0.
+ */
+static int btree_csum_one_bio(struct bio *bio)
+{
+        struct btrfs_root *root;
+        struct bio_vec *bvec;
+        int i;
+        WARN_ON(bio->bi_vcnt <= 0);
+        for (i = 0; i < bio->bi_vcnt; i++) {
+                bvec = &bio->bi_io_vec[i];
+                root = BTRFS_I(bvec->bv_page->mapping->host)->root;
+                csum_dirty_buffer(root, bvec->bv_page);
+        }
+        return 0;
+}
+/*
+ * start hook for async btree writes: checksum every page in the bio.
+ * The bio itself is mapped and sent later by __btree_submit_bio_done().
+ * inode, rw, mirror_num and bio_flags are not used.  Always returns 0.
+ */
+static int __btree_submit_bio_start(struct inode *inode, int rw,
+                                    struct bio *bio, int mirror_num,
+                                    unsigned long bio_flags)
+{
+        (void)btree_csum_one_bio(bio);
+        return 0;
+}
+/*
+ * done hook for async btree writes.  We are already running in the
+ * async submission context here, so go straight to btrfs_map_bio.
+ */
+static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
+                                 int mirror_num, unsigned long bio_flags)
+{
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        return btrfs_map_bio(root, rw, bio, mirror_num, 1);
+}
+/*
+ * submit hook for btree bios.  Completion is always routed through the
+ * metadata end_io workers.  Reads are mapped and submitted directly so
+ * checksum validation can happen in the async kernel threads; writes go
+ * through the kthread helpers so checksumming can run in parallel
+ * across CPUs.
+ */
+static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
+                                 int mirror_num, unsigned long bio_flags)
+{
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        int is_write = rw & (1 << BIO_RW);
+        int ret;
+        ret = btrfs_bio_wq_end_io(root->fs_info, bio, 1);
+        BUG_ON(ret);
+        if (is_write)
+                return btrfs_wq_submit_bio(root->fs_info,
+                                           inode, rw, bio, mirror_num, 0,
+                                           __btree_submit_bio_start,
+                                           __btree_submit_bio_done);
+        return btrfs_map_bio(root, rw, bio, mirror_num, 0);
+}
+/*
+ * writepage for the btree inode.  When called with PF_MEMALLOC set on
+ * the current task the page is redirtied and unlocked instead of being
+ * written.
+ */
+static int btree_writepage(struct page *page, struct writeback_control *wbc)
+{
+        struct extent_io_tree *tree = &BTRFS_I(page->mapping->host)->io_tree;
+        if (!(current->flags & PF_MEMALLOC))
+                return extent_write_full_page(tree, page, btree_get_extent,
+                                              wbc);
+        redirty_page_for_writepage(wbc, page);
+        unlock_page(page);
+        return 0;
+}
+/*
+ * writepages for the btree inode.  For WB_SYNC_NONE writeback nothing
+ * is written when the request is for kupdate, or when fewer than 32MB
+ * of EXTENT_DIRTY range is counted in the io tree.
+ */
+static int btree_writepages(struct address_space *mapping,
+                            struct writeback_control *wbc)
+{
+        struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
+        if (wbc->sync_mode == WB_SYNC_NONE) {
+                unsigned long thresh = 32 * 1024 * 1024;
+                u64 start = 0;
+                u64 num_dirty;
+                if (wbc->for_kupdate)
+                        return 0;
+                num_dirty = count_range_bits(tree, &start, (u64)-1,
+                                             thresh, EXTENT_DIRTY);
+                if (num_dirty < thresh)
+                        return 0;
+        }
+        return extent_writepages(tree, mapping, btree_get_extent, wbc);
+}
+/* readpage for the btree inode; file is not used */
+static int btree_readpage(struct file *file, struct page *page)
+{
+        struct extent_io_tree *tree = &BTRFS_I(page->mapping->host)->io_tree;
+        return extent_read_full_page(tree, page, btree_get_extent);
+}
+/*
+ * releasepage for the btree inode.  Pages that are dirty or under
+ * writeback are refused.  Otherwise the extent state and then the
+ * extent buffer are released; when the buffer release returns 1 the
+ * page's private data is cleared and a page reference is dropped.
+ *
+ * Returns 0 if the page was not released, otherwise the result of
+ * try_release_extent_buffer().
+ */
+static int btree_releasepage(struct page *page, gfp_t gfp_flags)
+{
+        struct extent_io_tree *tree;
+        struct extent_map_tree *map;
+        int released;
+        if (PageWriteback(page) || PageDirty(page))
+                return 0;
+        tree = &BTRFS_I(page->mapping->host)->io_tree;
+        map = &BTRFS_I(page->mapping->host)->extent_tree;
+        if (!try_release_extent_state(map, tree, page, gfp_flags))
+                return 0;
+        released = try_release_extent_buffer(tree, page);
+        if (released != 1)
+                return released;
+        ClearPagePrivate(page);
+        set_page_private(page, 0);
+        page_cache_release(page);
+        return released;
+}
+/*
+ * invalidatepage for the btree inode.  After invalidating and trying
+ * to release the page, any private data still attached is reported and
+ * torn down by hand.
+ */
+static void btree_invalidatepage(struct page *page, unsigned long offset)
+{
+        struct extent_io_tree *tree = &BTRFS_I(page->mapping->host)->io_tree;
+        extent_invalidatepage(tree, page, offset);
+        btree_releasepage(page, GFP_NOFS);
+        if (!PagePrivate(page))
+                return;
+        printk(KERN_WARNING "btrfs warning page private not zero "
+               "on page %llu\n", (unsigned long long)page_offset(page));
+        ClearPagePrivate(page);
+        set_page_private(page, 0);
+        page_cache_release(page);
+}
+#if 0 /* compiled out: buffer_head based variant of btree_writepage */
+static int btree_writepage(struct page *page, struct writeback_control *wbc)
+{
+        struct buffer_head *bh;
+        struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
+        struct buffer_head *head;
+        if (!page_has_buffers(page)) {
+                create_empty_buffers(page, root->fs_info->sb->s_blocksize,
+                                        (1 << BH_Dirty)|(1 << BH_Uptodate));
+        }
+        head = page_buffers(page);
+        bh = head;
+        do { /* checksum each dirty buffer attached to the page */
+                if (buffer_dirty(bh))
+                        csum_tree_block(root, bh, 0);
+                bh = bh->b_this_page;
+        } while (bh != head);
+        return block_write_full_page(page, btree_get_block, wbc);
+}
+#endif
+static struct address_space_operations btree_aops = { /* wires up the btree_* page helpers above */
+        .readpage       = btree_readpage,
+        .writepage      = btree_writepage,
+        .writepages     = btree_writepages,
+        .releasepage    = btree_releasepage,
+        .invalidatepage = btree_invalidatepage,
+        .sync_page      = block_sync_page,
+};
+/*
+ * start a read of a tree block without waiting for it (the wait
+ * argument passed to read_extent_buffer_pages is 0).  parent_transid
+ * is not used.  Always returns 0, including when no buffer could be
+ * found or created.
+ */
+int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
+                         u64 parent_transid)
+{
+        struct inode *btree_inode = root->fs_info->btree_inode;
+        struct extent_buffer *buf;
+        buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
+        if (buf) {
+                read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
+                                         buf, 0, 0, btree_get_extent, 0);
+                free_extent_buffer(buf);
+        }
+        return 0;
+}
+/*
+ * look up an extent buffer for bytenr/blocksize in the btree inode's
+ * io tree; returns whatever find_extent_buffer() returns.
+ */
+struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
+                                            u64 bytenr, u32 blocksize)
+{
+        struct inode *btree_inode = root->fs_info->btree_inode;
+        return find_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
+                                  bytenr, blocksize, GFP_NOFS);
+}
+/*
+ * get an extent buffer for bytenr/blocksize from the btree inode's io
+ * tree via alloc_extent_buffer(); returns whatever that returns.
+ */
+struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
+                                                 u64 bytenr, u32 blocksize)
+{
+        struct inode *btree_inode = root->fs_info->btree_inode;
+        return alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
+                                   bytenr, blocksize, NULL, GFP_NOFS);
+}
+/* write out the byte range covered by buf with WB_SYNC_ALL */
+int btrfs_write_tree_block(struct extent_buffer *buf)
+{
+        struct address_space *mapping = buf->first_page->mapping;
+        return btrfs_fdatawrite_range(mapping, buf->start,
+                                      buf->start + buf->len - 1, WB_SYNC_ALL);
+}
+/* wait for writeback on the byte range covered by buf */
+int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
+{
+        struct address_space *mapping = buf->first_page->mapping;
+        return btrfs_wait_on_page_writeback_range(mapping,
+                                  buf->start, buf->start + buf->len - 1);
+}
+/*
+ * find or create the extent buffer for a tree block and read it in,
+ * checking it against parent_transid.
+ *
+ * Returns NULL only if no buffer could be obtained.  A failed read
+ * triggers a WARN_ON and the buffer is still returned, just without
+ * EXTENT_UPTODATE set in its flags.
+ */
+struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
+                                      u32 blocksize, u64 parent_transid)
+{
+        struct inode *btree_inode = root->fs_info->btree_inode;
+        struct extent_io_tree *io_tree;
+        struct extent_buffer *buf;
+        io_tree = &BTRFS_I(btree_inode)->io_tree;
+        buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
+        if (!buf)
+                return NULL;
+        if (btree_read_extent_buffer_pages(root, buf, 0, parent_transid) != 0)
+                WARN_ON(1);
+        else
+                buf->flags |= EXTENT_UPTODATE;
+        return buf;
+}
+/*
+ * clear the dirty state of a tree block, but only when its header
+ * generation is that of the running transaction.  trans is not used.
+ * Always returns 0.
+ */
+int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+                     struct extent_buffer *buf)
+{
+        struct inode *btree_inode = root->fs_info->btree_inode;
+        if (btrfs_header_generation(buf) !=
+            root->fs_info->running_transaction->transid)
+                return 0;
+        WARN_ON(!btrfs_tree_locked(buf));
+        clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf);
+        return 0;
+}
+/*
+ * put a btrfs_root into its initial state: record the block geometry,
+ * owning fs_info and objectid, and initialize every list, lock, tree
+ * and embedded item.  Always returns 0.
+ */
+static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
+                        u32 stripesize, struct btrfs_root *root,
+                        struct btrfs_fs_info *fs_info,
+                        u64 objectid)
+{
+        /* identity and geometry handed in by the caller */
+        root->fs_info = fs_info;
+        root->objectid = objectid;
+        root->nodesize = nodesize;
+        root->leafsize = leafsize;
+        root->sectorsize = sectorsize;
+        root->stripesize = stripesize;
+        /* nothing read or tracked yet */
+        root->node = NULL;
+        root->commit_root = NULL;
+        root->ref_tree = NULL;
+        root->name = NULL;
+        root->ref_cows = 0;
+        root->track_dirty = 0;
+        root->in_sysfs = 0;
+        root->last_trans = 0;
+        root->highest_inode = 0;
+        root->last_inode_alloc = 0;
+        /* lists and locks */
+        INIT_LIST_HEAD(&root->dirty_list);
+        INIT_LIST_HEAD(&root->orphan_list);
+        INIT_LIST_HEAD(&root->dead_list);
+        spin_lock_init(&root->node_lock);
+        spin_lock_init(&root->list_lock);
+        mutex_init(&root->objectid_mutex);
+        mutex_init(&root->log_mutex);
+        extent_io_tree_init(&root->dirty_log_pages,
+                             fs_info->btree_inode->i_mapping, GFP_NOFS);
+        /* the leaf ref tree is embedded in the root itself */
+        btrfs_leaf_ref_tree_init(&root->ref_tree_struct);
+        root->ref_tree = &root->ref_tree_struct;
+        /* zero the embedded items, then fill in the one known key field */
+        memset(&root->root_key, 0, sizeof(root->root_key));
+        memset(&root->root_item, 0, sizeof(root->root_item));
+        memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
+        memset(&root->root_kobj, 0, sizeof(root->root_kobj));
+        root->root_key.objectid = objectid;
+        init_completion(&root->kobj_unregister);
+        /* defrag state */
+        root->defrag_trans_start = fs_info->generation;
+        root->defrag_running = 0;
+        root->defrag_level = 0;
+        /* anonymous super block */
+        root->anon_super.s_root = NULL;
+        root->anon_super.s_dev = 0;
+        INIT_LIST_HEAD(&root->anon_super.s_list);
+        INIT_LIST_HEAD(&root->anon_super.s_instances);
+        init_rwsem(&root->anon_super.s_umount);
+        return 0;
+}
+/*
+ * initialize root with tree_root's geometry, look up its latest root
+ * item through tree_root, and read the block that item points at into
+ * root->node.  Lookup failure or a NULL node is a BUG.  Returns 0.
+ */
+static int find_and_setup_root(struct btrfs_root *tree_root,
+                               struct btrfs_fs_info *fs_info,
+                               u64 objectid,
+                               struct btrfs_root *root)
+{
+        u64 generation;
+        u64 bytenr;
+        u32 blocksize;
+        int ret;
+        __setup_root(tree_root->nodesize, tree_root->leafsize,
+                     tree_root->sectorsize, tree_root->stripesize,
+                     root, fs_info, objectid);
+        ret = btrfs_find_last_root(tree_root, objectid,
+                                   &root->root_item, &root->root_key);
+        BUG_ON(ret);
+        generation = btrfs_root_generation(&root->root_item);
+        blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
+        bytenr = btrfs_root_bytenr(&root->root_item);
+        root->node = read_tree_block(root, bytenr, blocksize, generation);
+        BUG_ON(!root->node);
+        return 0;
+}
+/*
+ * tear down fs_info->log_root_tree, if there is one: clear every
+ * EXTENT_DIRTY range recorded in its dirty_log_pages, give the root
+ * node's extent back, drop the node and free the root.  trans is not
+ * used.  Returns 0.
+ */
+int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
+                             struct btrfs_fs_info *fs_info)
+{
+        struct btrfs_root *log_root_tree = fs_info->log_root_tree;
+        struct extent_buffer *eb;
+        u64 start = 0;
+        u64 end = 0;
+        int ret;
+        if (!log_root_tree)
+                return 0;
+        /* keep clearing until no dirty range is left */
+        for (;;) {
+                ret = find_first_extent_bit(&log_root_tree->dirty_log_pages,
+                                    0, &start, &end, EXTENT_DIRTY);
+                if (ret)
+                        break;
+                clear_extent_dirty(&log_root_tree->dirty_log_pages,
+                                   start, end, GFP_NOFS);
+        }
+        eb = fs_info->log_root_tree->node;
+        /* the node is expected to be an empty leaf by now */
+        WARN_ON(btrfs_header_level(eb) != 0);
+        WARN_ON(btrfs_header_nritems(eb) != 0);
+        ret = btrfs_free_reserved_extent(fs_info->tree_root,
+                                eb->start, eb->len);
+        BUG_ON(ret);
+        free_extent_buffer(eb);
+        kfree(fs_info->log_root_tree);
+        fs_info->log_root_tree = NULL;
+        return 0;
+}
+/*
+ * Create the log root tree and publish it as fs_info->log_root_tree.
+ *
+ * A fresh btrfs_root is allocated and set up with the geometry of
+ * fs_info->tree_root, a new block is allocated for it inside @trans, and
+ * that block's header is initialised as an empty level-0 node owned by
+ * BTRFS_TREE_LOG_OBJECTID before it is marked dirty and unlocked.
+ *
+ * Returns 0 on success, -ENOMEM when the root cannot be allocated, or the
+ * error reported by the block allocation.  Nothing is published and the
+ * root is freed on failure.
+ */
+int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
+                             struct btrfs_fs_info *fs_info)
+{
+        struct btrfs_root *root;
+        struct btrfs_root *tree_root = fs_info->tree_root;
+        struct extent_buffer *leaf;
+        int err;
+        root = kzalloc(sizeof(*root), GFP_NOFS);
+        if (!root)
+                return -ENOMEM;
+        __setup_root(tree_root->nodesize, tree_root->leafsize,
+                     tree_root->sectorsize, tree_root->stripesize,
+                     root, fs_info, BTRFS_TREE_LOG_OBJECTID);
+        root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
+        root->root_key.type = BTRFS_ROOT_ITEM_KEY;
+        root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
+        root->ref_cows = 0;
+        leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
+                                      0, BTRFS_TREE_LOG_OBJECTID,
+                                      trans->transid, 0, 0, 0);
+        /*
+         * NOTE(review): btrfs_alloc_free_block() presumably signals failure
+         * with an ERR_PTR() value -- confirm.  NULL is rejected as well so
+         * the header setters below never see an invalid buffer.
+         */
+        if (!leaf || IS_ERR(leaf)) {
+                err = leaf ? PTR_ERR(leaf) : -ENOMEM;
+                kfree(root);
+                return err;
+        }
+        root->node = leaf;
+        btrfs_set_header_nritems(leaf, 0);
+        btrfs_set_header_level(leaf, 0);
+        btrfs_set_header_bytenr(leaf, leaf->start);
+        btrfs_set_header_generation(leaf, trans->transid);
+        btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID);
+        write_extent_buffer(leaf, root->fs_info->fsid,
+                            (unsigned long)btrfs_header_fsid(leaf),
+                            BTRFS_FSID_SIZE);
+        btrfs_mark_buffer_dirty(leaf);
+        btrfs_tree_unlock(leaf);
+        fs_info->log_root_tree = root;
+        return 0;
+}
+/*
+ * Allocate a btrfs_root and load the tree described by @location without
+ * touching fs_info->fs_roots_radix.
+ *
+ * When location->offset is (u64)-1 the root item is found through
+ * find_and_setup_root(); otherwise the exact key is searched for in
+ * @tree_root and its item copied into the new root.  For every tree other
+ * than BTRFS_TREE_LOG_OBJECTID, ref_cows is set and the highest inode
+ * number is cached when it can be determined.
+ *
+ * Returns the new root, or an ERR_PTR(): -ENOMEM on allocation failure,
+ * -ENOENT when the key is not present, -EIO when the root node cannot be
+ * read, or the error from the lookup.  The root is freed on every error
+ * path.
+ */
+struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
+                                               struct btrfs_key *location)
+{
+        struct btrfs_root *root;
+        struct btrfs_fs_info *fs_info = tree_root->fs_info;
+        struct btrfs_path *path;
+        struct extent_buffer *l;
+        u64 highest_inode;
+        u64 generation;
+        u32 blocksize;
+        int ret = 0;
+        root = kzalloc(sizeof(*root), GFP_NOFS);
+        if (!root)
+                return ERR_PTR(-ENOMEM);
+        if (location->offset == (u64)-1) {
+                ret = find_and_setup_root(tree_root, fs_info,
+                                          location->objectid, root);
+                if (ret) {
+                        kfree(root);
+                        return ERR_PTR(ret);
+                }
+                goto insert;
+        }
+        __setup_root(tree_root->nodesize, tree_root->leafsize,
+                     tree_root->sectorsize, tree_root->stripesize,
+                     root, fs_info, location->objectid);
+        path = btrfs_alloc_path();
+        /* out of memory is reported like the root allocation above */
+        if (!path) {
+                kfree(root);
+                return ERR_PTR(-ENOMEM);
+        }
+        ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
+        if (ret == 0) {
+                l = path->nodes[0];
+                read_extent_buffer(l, &root->root_item,
+                       btrfs_item_ptr_offset(l, path->slots[0]),
+                       sizeof(root->root_item));
+                memcpy(&root->root_key, location, sizeof(*location));
+        } else if (ret > 0) {
+                /* positive means the exact key does not exist */
+                ret = -ENOENT;
+        }
+        btrfs_release_path(root, path);
+        btrfs_free_path(path);
+        if (ret) {
+                kfree(root);
+                return ERR_PTR(ret);
+        }
+        generation = btrfs_root_generation(&root->root_item);
+        blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
+        root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
+                                     blocksize, generation);
+        /* callers already handle ERR_PTR, so do not BUG on a failed read */
+        if (!root->node) {
+                kfree(root);
+                return ERR_PTR(-EIO);
+        }
+insert:
+        if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
+                root->ref_cows = 1;
+                ret = btrfs_find_highest_inode(root, &highest_inode);
+                if (ret == 0) {
+                        root->highest_inode = highest_inode;
+                        root->last_inode_alloc = highest_inode;
+                }
+        }
+        return root;
+}
+/*
+ * Find an already loaded root by objectid.
+ *
+ * The root tree and the extent tree are taken directly from @fs_info; any
+ * other id is looked up in fs_info->fs_roots_radix and the result of that
+ * lookup is returned unchanged.
+ */
+struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
+                                        u64 root_objectid)
+{
+        switch (root_objectid) {
+        case BTRFS_ROOT_TREE_OBJECTID:
+                return fs_info->tree_root;
+        case BTRFS_EXTENT_TREE_OBJECTID:
+                return fs_info->extent_root;
+        default:
+                return radix_tree_lookup(&fs_info->fs_roots_radix,
+                                         (unsigned long)root_objectid);
+        }
+}
+/*
+ * Return the root described by @location, loading it if necessary.
+ *
+ * The root, extent, chunk, dev and csum trees are served straight from
+ * @fs_info.  Other roots come from fs_info->fs_roots_radix when already
+ * present; otherwise the root is read with btrfs_read_fs_root_no_radix(),
+ * given an anonymous super and inserted into the radix tree.  Unless the
+ * super block is mounted MS_RDONLY, dead roots are then looked up and
+ * orphan cleanup is run for the new root.
+ *
+ * Returns the root or an ERR_PTR() on failure.
+ */
+struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
+                                              struct btrfs_key *location)
+{
+        struct btrfs_root *root;
+        int ret;
+        switch (location->objectid) {
+        case BTRFS_ROOT_TREE_OBJECTID:
+                return fs_info->tree_root;
+        case BTRFS_EXTENT_TREE_OBJECTID:
+                return fs_info->extent_root;
+        case BTRFS_CHUNK_TREE_OBJECTID:
+                return fs_info->chunk_root;
+        case BTRFS_DEV_TREE_OBJECTID:
+                return fs_info->dev_root;
+        case BTRFS_CSUM_TREE_OBJECTID:
+                return fs_info->csum_root;
+        default:
+                break;
+        }
+        root = radix_tree_lookup(&fs_info->fs_roots_radix,
+                                 (unsigned long)location->objectid);
+        if (root)
+                return root;
+        root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
+        if (IS_ERR(root))
+                return root;
+        set_anon_super(&root->anon_super, NULL);
+        ret = radix_tree_insert(&fs_info->fs_roots_radix,
+                                (unsigned long)root->root_key.objectid,
+                                root);
+        if (ret) {
+                /* never published, so it is still ours to release */
+                free_extent_buffer(root->node);
+                kfree(root);
+                return ERR_PTR(ret);
+        }
+        if (fs_info->sb->s_flags & MS_RDONLY)
+                return root;
+        ret = btrfs_find_dead_roots(fs_info->tree_root,
+                                    root->root_key.objectid, root);
+        BUG_ON(ret);
+        btrfs_orphan_cleanup(root);
+        return root;
+}
+/*
+ * Look up (or load) the root at @location and attach @name to it.
+ *
+ * The root comes from btrfs_read_fs_root_no_name().  If it is not yet
+ * flagged in_sysfs, its name is set with btrfs_set_root_name() and the
+ * flag is raised; sysfs registration itself is compiled out.
+ *
+ * Returns the root, NULL if the lookup returned NULL, or an ERR_PTR() when
+ * the lookup or the name assignment fails.
+ */
+struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
+                                      struct btrfs_key *location,
+                                      const char *name, int namelen)
+{
+        struct btrfs_root *root;
+        int ret;
+        root = btrfs_read_fs_root_no_name(fs_info, location);
+        if (!root)
+                return NULL;
+        /* the lookup reports failure as ERR_PTR(); never dereference it */
+        if (IS_ERR(root))
+                return root;
+        if (root->in_sysfs)
+                return root;
+        ret = btrfs_set_root_name(root, name, namelen);
+        /*
+         * The root is one of fs_info's global roots or already sits in
+         * fs_info->fs_roots_radix, so it is not ours to free: releasing it
+         * here would leave a dangling entry behind.  Just report the error.
+         */
+        if (ret)
+                return ERR_PTR(ret);
+#if 0
+        /* disabled: sysfs registration of the root */
+        ret = btrfs_sysfs_add_root(root);
+        if (ret)
+                return ERR_PTR(ret);
+#endif
+        root->in_sysfs = 1;
+        return root;
+}
+/*
+ * Congestion callback installed on the filesystem's backing_dev_info.
+ *
+ * @congested_data is the btrfs_fs_info registered in setup_bdi().  Returns
+ * 1 as soon as the backing device of any opened device in
+ * info->fs_devices reports congestion for @bdi_bits, 0 otherwise.
+ */
+static int btrfs_congested_fn(void *congested_data, int bdi_bits)
+{
+        struct btrfs_fs_info *info = congested_data;
+        struct btrfs_device *device;
+        struct backing_dev_info *bdi;
+#if 0
+        if ((bdi_bits & (1 << BDI_write_congested)) &&
+            btrfs_congested_async(info, 0))
+                return 1;
+#endif
+        list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
+                /* devices without an open block device are skipped */
+                if (!device->bdev)
+                        continue;
+                bdi = blk_get_backing_dev_info(device->bdev);
+                if (bdi && bdi_congested(bdi, bdi_bits))
+                        return 1;
+        }
+        return 0;
+}
+/*
+ * Unplug every opened device that belongs to this filesystem.
+ *
+ * @bdi is the filesystem's own backing_dev_info; its unplug_io_data holds
+ * the btrfs_fs_info.  For each device in info->fs_devices that has a block
+ * device, the unplug_io_fn of that device's backing_dev_info is invoked
+ * with @page.  Devices whose backing_dev_info is missing or has no unplug
+ * callback are skipped.
+ */
+static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
+{
+        struct btrfs_fs_info *info = bdi->unplug_io_data;
+        struct btrfs_device *device;
+        struct backing_dev_info *dev_bdi;
+        list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
+                if (!device->bdev)
+                        continue;
+                dev_bdi = blk_get_backing_dev_info(device->bdev);
+                /*
+                 * btrfs_congested_fn() guards against a NULL result from
+                 * blk_get_backing_dev_info(); apply the same check here
+                 * before dereferencing.
+                 */
+                if (dev_bdi && dev_bdi->unplug_io_fn)
+                        dev_bdi->unplug_io_fn(dev_bdi, page);
+        }
+}
+/*
+ * unplug_io_fn callback installed on the filesystem's backing_dev_info.
+ *
+ * As written, the leading "1 ||" forces every call to fall through to
+ * __unplug_io_fn(), so the per-page lookup below is currently unreachable.
+ * That lookup would map @page to its extent and unplug only the matching
+ * location via btrfs_unplug_page(), falling back to __unplug_io_fn() when
+ * there are at most two open devices or no usable extent mapping exists.
+ */
+static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
+{
+        struct inode *inode;
+        struct extent_map_tree *em_tree;
+        struct extent_map *em;
+        struct address_space *mapping;
+        u64 offset;
+        /* the generic O_DIRECT read code does this */
+        if (1 || !page)
+                goto unplug_all;
+        /*
+         * page->mapping may change at any time.  Take one snapshot of it
+         * and use only that copy from here on.
+         */
+        smp_mb();
+        mapping = page->mapping;
+        if (!mapping)
+                return;
+        inode = mapping->host;
+        /* the search is not worth its cost with only a few devices */
+        if (BTRFS_I(inode)->root->fs_info->fs_devices->open_devices <= 2)
+                goto unplug_all;
+        offset = page_offset(page);
+        em_tree = &BTRFS_I(inode)->extent_tree;
+        spin_lock(&em_tree->lock);
+        em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
+        spin_unlock(&em_tree->lock);
+        if (!em)
+                goto unplug_all;
+        if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+                free_extent_map(em);
+                goto unplug_all;
+        }
+        btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree,
+                          em->block_start + (offset - em->start), page);
+        free_extent_map(em);
+        return;
+unplug_all:
+        __unplug_io_fn(bdi, page);
+}
+/*
+ * Prepare @bdi as the backing_dev_info of the filesystem @info.
+ *
+ * After bdi_init(), readahead size and capabilities are copied from
+ * default_backing_dev_info, the state is cleared, and the unplug and
+ * congestion callbacks are pointed at the btrfs implementations with
+ * @info as their private data.  Always returns 0.
+ */
+static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
+{
+        bdi_init(bdi);
+        /* start from the kernel-wide defaults */
+        bdi->ra_pages = default_backing_dev_info.ra_pages;
+        bdi->capabilities = default_backing_dev_info.capabilities;
+        bdi->state = 0;
+        /* unplug and congestion queries are answered by btrfs */
+        bdi->unplug_io_data = info;
+        bdi->unplug_io_fn = btrfs_unplug_io_fn;
+        bdi->congested_data = info;
+        bdi->congested_fn = btrfs_congested_fn;
+        return 0;
+}
+/*
+ * Decide whether the buffer covered by @bio can be checksummed yet.
+ *
+ * The segments are walked, accumulating their lengths.  A page whose
+ * ->private is neither 0 nor EXTENT_PAGE_PRIVATE restarts the count and
+ * supplies the buffer length (page->private >> 2), the start offset and
+ * the io_tree to query.
+ * NOTE(review): presumably such a page marks the first page of a tree
+ * block, with its length encoded in ->private -- confirm.
+ *
+ * Returns 1 when the counted bytes already cover the buffer; otherwise the
+ * result of extent_range_uptodate() for the part of the buffer that lies
+ * beyond this bio.
+ */
+static int bio_ready_for_csum(struct bio *bio)
+{
+        u64 length = 0;
+        u64 buf_len = 0;
+        u64 start = 0;
+        struct page *page;
+        struct extent_io_tree *io_tree = NULL;
+        struct bio_vec *bvec;
+        int i;
+        bio_for_each_segment(bvec, bio, i) {
+                page = bvec->bv_page;
+                if (!page->private ||
+                    page->private == EXTENT_PAGE_PRIVATE) {
+                        length += bvec->bv_len;
+                        continue;
+                }
+                length = bvec->bv_len;
+                buf_len = page->private >> 2;
+                start = page_offset(page) + bvec->bv_offset;
+                io_tree = &BTRFS_I(page->mapping->host)->io_tree;
+        }
+        /*
+         * are we fully contained in this bio?  (buf_len stays 0, and this
+         * returns, whenever io_tree was never set above)
+         */
+        if (buf_len <= length)
+                return 1;
+        return extent_range_uptodate(io_tree, start + length,
+                                     start + buf_len - 1);
+}
+/*
+ * Worker-thread half of bio completion: restores the bio's original
+ * bi_private / bi_end_io from the end_io_wq wrapper, frees the wrapper and
+ * completes the bio with the saved error.  This is where read checksum
+ * verification actually happens.
+ */
+static void end_workqueue_fn(struct btrfs_work *work)
+{
+        struct end_io_wq *wq = container_of(work, struct end_io_wq, work);
+        struct bio *bio = wq->bio;
+        struct btrfs_fs_info *fs_info = wq->info;
+        int is_read = !(bio->bi_rw & (1 << BIO_RW));
+        int error;
+        /*
+         * A metadata read must have its whole tree block in ram and up to
+         * date before it can be checksummed in one go.  Until then the work
+         * item is queued again on the endio-meta workers.  For
+         * blocksize <= pagesize this is basically a noop.
+         */
+        if (is_read && wq->metadata && !bio_ready_for_csum(bio)) {
+                btrfs_queue_worker(&fs_info->endio_meta_workers, &wq->work);
+                return;
+        }
+        /* grab everything needed from the wrapper before freeing it */
+        error = wq->error;
+        bio->bi_private = wq->private;
+        bio->bi_end_io = wq->end_io;
+        kfree(wq);
+        bio_endio(bio, error);
+}
+/*
+ * Body of the "btrfs-cleaner" kernel thread; @arg is the tree root.
+ *
+ * Each pass waits out a write freeze, runs btrfs_clean_old_snapshots()
+ * under cleaner_mutex and then sleeps interruptibly until woken (or enters
+ * the refrigerator when the task is being frozen).  The loop ends when
+ * fs_info->closing is set or kthread_should_stop() is true.  Returns 0.
+ */
+static int cleaner_kthread(void *arg)
+{
+        struct btrfs_root *root = arg;
+        struct btrfs_fs_info *fs_info = root->fs_info;
+        do {
+                smp_mb();
+                if (fs_info->closing)
+                        break;
+                vfs_check_frozen(fs_info->sb, SB_FREEZE_WRITE);
+                mutex_lock(&fs_info->cleaner_mutex);
+                btrfs_clean_old_snapshots(root);
+                mutex_unlock(&fs_info->cleaner_mutex);
+                if (freezing(current)) {
+                        refrigerator();
+                        continue;
+                }
+                /* look at ->closing once more before going to sleep */
+                smp_mb();
+                if (fs_info->closing)
+                        break;
+                set_current_state(TASK_INTERRUPTIBLE);
+                schedule();
+                __set_current_state(TASK_RUNNING);
+        } while (!kthread_should_stop());
+        return 0;
+}
+/*
+ * Body of the "btrfs-transaction" kernel thread; @arg is the tree root.
+ *
+ * Each pass, under transaction_kthread_mutex, looks at the running
+ * transaction: when it has existed for at least 30 seconds it is
+ * committed; when it is younger the thread re-checks after 5 seconds
+ * instead of the default 30.  The cleaner thread is woken on every pass.
+ * The loop ends when fs_info->closing is set or kthread_should_stop() is
+ * true.  Returns 0; the commit result is not examined.
+ */
+static int transaction_kthread(void *arg)
+{
+        struct btrfs_root *root = arg;
+        struct btrfs_fs_info *fs_info = root->fs_info;
+        struct btrfs_trans_handle *trans;
+        struct btrfs_transaction *cur;
+        unsigned long now;
+        unsigned long delay;
+        int commit;
+        int ret;
+        do {
+                smp_mb();
+                if (fs_info->closing)
+                        break;
+                delay = HZ * 30;
+                commit = 0;
+                vfs_check_frozen(fs_info->sb, SB_FREEZE_WRITE);
+                mutex_lock(&fs_info->transaction_kthread_mutex);
+                if (fs_info->total_ref_cache_size > 20 * 1024 * 1024) {
+                        printk(KERN_INFO "btrfs: total reference cache "
+                               "size %llu\n",
+                               fs_info->total_ref_cache_size);
+                }
+                /* decide under trans_mutex, act after dropping it */
+                mutex_lock(&fs_info->trans_mutex);
+                cur = fs_info->running_transaction;
+                if (cur) {
+                        now = get_seconds();
+                        if (now < cur->start_time ||
+                            now - cur->start_time < 30)
+                                delay = HZ * 5;
+                        else
+                                commit = 1;
+                }
+                mutex_unlock(&fs_info->trans_mutex);
+                if (commit) {
+                        trans = btrfs_start_transaction(root, 1);
+                        ret = btrfs_commit_transaction(trans, root);
+                }
+                wake_up_process(fs_info->cleaner_kthread);
+                mutex_unlock(&fs_info->transaction_kthread_mutex);
+                if (freezing(current)) {
+                        refrigerator();
+                        continue;
+                }
+                if (fs_info->closing)
+                        break;
+                set_current_state(TASK_INTERRUPTIBLE);
+                schedule_timeout(delay);
+                __set_current_state(TASK_RUNNING);
+        } while (!kthread_should_stop());
+        return 0;
+}
+struct btrfs_root *open_ctree(struct super_block *sb,
+                              struct btrfs_fs_devices *fs_devices,
+                              char *options)
+{
+        u32 sectorsize;
+        u32 nodesize;
+        u32 leafsize;
+        u32 blocksize;
+        u32 stripesize;
+        u64 generation;
+        u64 features;
+        struct btrfs_key location;
+        struct buffer_head *bh;
+        struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root),
+                                                 GFP_NOFS);
+        struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root),
+                                                 GFP_NOFS);
+        struct btrfs_root *tree_root = kzalloc(sizeof(struct btrfs_root),
+                                               GFP_NOFS);
+        struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info),
+                                                GFP_NOFS);
+        struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root),
+                                                GFP_NOFS);
+        struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root),
+                                              GFP_NOFS);
+        struct btrfs_root *log_tree_root;
+        int ret;
+        int err = -EINVAL;
+        struct btrfs_super_block *disk_super;
+        if (!extent_root || !tree_root || !fs_info ||
+            !chunk_root || !dev_root || !csum_root) {
+                err = -ENOMEM;
+                goto fail;
+        }
+        INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS);
+        INIT_LIST_HEAD(&fs_info->trans_list);
+        INIT_LIST_HEAD(&fs_info->dead_roots);
+        INIT_LIST_HEAD(&fs_info->hashers);
+        INIT_LIST_HEAD(&fs_info->delalloc_inodes);
+        spin_lock_init(&fs_info->hash_lock);
+        spin_lock_init(&fs_info->delalloc_lock);
+        spin_lock_init(&fs_info->new_trans_lock);
+        spin_lock_init(&fs_info->ref_cache_lock);
+        init_completion(&fs_info->kobj_unregister);
+        fs_info->tree_root = tree_root;
+        fs_info->extent_root = extent_root;
+        fs_info->csum_root = csum_root;
+        fs_info->chunk_root = chunk_root;
+        fs_info->dev_root = dev_root;
+        fs_info->fs_devices = fs_devices;
+        INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
+        INIT_LIST_HEAD(&fs_info->space_info);
+        btrfs_mapping_init(&fs_info->mapping_tree);
+        atomic_set(&fs_info->nr_async_submits, 0);
+        atomic_set(&fs_info->async_delalloc_pages, 0);
+        atomic_set(&fs_info->async_submit_draining, 0);
+        atomic_set(&fs_info->nr_async_bios, 0);
+        atomic_set(&fs_info->throttles, 0);
+        atomic_set(&fs_info->throttle_gen, 0);
+        fs_info->sb = sb;
+        fs_info->max_extent = (u64)-1;
+        fs_info->max_inline = 8192 * 1024;
+        setup_bdi(fs_info, &fs_info->bdi);
+        fs_info->btree_inode = new_inode(sb);
+        fs_info->btree_inode->i_ino = 1;
+        fs_info->btree_inode->i_nlink = 1;
+        fs_info->thread_pool_size = min_t(unsigned long,
+                                          num_online_cpus() + 2, 8);
+        INIT_LIST_HEAD(&fs_info->ordered_extents);
+        spin_lock_init(&fs_info->ordered_extent_lock);
+        sb->s_blocksize = 4096;
+        sb->s_blocksize_bits = blksize_bits(4096);
+        /*
+         * we set the i_size on the btree inode to the max possible int.
+         * the real end of the address space is determined by all of
+         * the devices in the system
+         */
+        fs_info->btree_inode->i_size = OFFSET_MAX;
+        fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
+        fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi;
+        extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
+                             fs_info->btree_inode->i_mapping,
+                             GFP_NOFS);
+        extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree,
+                             GFP_NOFS);
+        BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
+        spin_lock_init(&fs_info->block_group_cache_lock);
+        fs_info->block_group_cache_tree.rb_node = NULL;
+        extent_io_tree_init(&fs_info->pinned_extents,
+                             fs_info->btree_inode->i_mapping, GFP_NOFS);
+        extent_io_tree_init(&fs_info->pending_del,
+                             fs_info->btree_inode->i_mapping, GFP_NOFS);
+        extent_io_tree_init(&fs_info->extent_ins,
+                             fs_info->btree_inode->i_mapping, GFP_NOFS);
+        fs_info->do_barriers = 1;
+        INIT_LIST_HEAD(&fs_info->dead_reloc_roots);
+        btrfs_leaf_ref_tree_init(&fs_info->reloc_ref_tree);
+        btrfs_leaf_ref_tree_init(&fs_info->shared_ref_tree);
+        BTRFS_I(fs_info->btree_inode)->root = tree_root;
+        memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
+               sizeof(struct btrfs_key));
+        insert_inode_hash(fs_info->btree_inode);
+        mutex_init(&fs_info->trans_mutex);
+        mutex_init(&fs_info->tree_log_mutex);
+        mutex_init(&fs_info->drop_mutex);
+        mutex_init(&fs_info->extent_ins_mutex);
+        mutex_init(&fs_info->pinned_mutex);
+        mutex_init(&fs_info->chunk_mutex);
+        mutex_init(&fs_info->transaction_kthread_mutex);
+        mutex_init(&fs_info->cleaner_mutex);
+        mutex_init(&fs_info->volume_mutex);
+        mutex_init(&fs_info->tree_reloc_mutex);
+        init_waitqueue_head(&fs_info->transaction_throttle);
+        init_waitqueue_head(&fs_info->transaction_wait);
+        init_waitqueue_head(&fs_info->async_submit_wait);
+        init_waitqueue_head(&fs_info->tree_log_wait);
+        atomic_set(&fs_info->tree_log_commit, 0);
+        atomic_set(&fs_info->tree_log_writers, 0);
+        fs_info->tree_log_transid = 0;
+        __setup_root(4096, 4096, 4096, 4096, tree_root,
+                     fs_info, BTRFS_ROOT_TREE_OBJECTID);
+        bh = btrfs_read_dev_super(fs_devices->latest_bdev);
+        if (!bh)
+                goto fail_iput;
+        memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
+        memcpy(&fs_info->super_for_commit, &fs_info->super_copy,
+               sizeof(fs_info->super_for_commit));
+        brelse(bh);
+        memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE);
+        disk_super = &fs_info->super_copy;
+        if (!btrfs_super_root(disk_super))
+                goto fail_iput;
+        ret = btrfs_parse_options(tree_root, options);
+        if (ret) {
+                err = ret;
+                goto fail_iput;
+        }
+        features = btrfs_super_incompat_flags(disk_super) &
+                ~BTRFS_FEATURE_INCOMPAT_SUPP;
+        if (features) {
+                printk(KERN_ERR "BTRFS: couldn't mount because of "
+                       "unsupported optional features (%Lx).\n",
+                       features);
+                err = -EINVAL;
+                goto fail_iput;
+        }
+        features = btrfs_super_compat_ro_flags(disk_super) &
+                ~BTRFS_FEATURE_COMPAT_RO_SUPP;
+        if (!(sb->s_flags & MS_RDONLY) && features) {
+                printk(KERN_ERR "BTRFS: couldn't mount RDWR because of "
+                       "unsupported option features (%Lx).\n",
+                       features);
+                err = -EINVAL;
+                goto fail_iput;
+        }
+        /*
+         * we need to start all the end_io workers up front because the
+         * queue work function gets called at interrupt time, and so it
+         * cannot dynamically grow.
+         */
+        btrfs_init_workers(&fs_info->workers, "worker",
+                           fs_info->thread_pool_size);
+        btrfs_init_workers(&fs_info->delalloc_workers, "delalloc",
+                           fs_info->thread_pool_size);
+        btrfs_init_workers(&fs_info->submit_workers, "submit",
+                           min_t(u64, fs_devices->num_devices,
+                           fs_info->thread_pool_size));
+        /* a higher idle thresh on the submit workers makes it much more
+         * likely that bios will be send down in a sane order to the
+         * devices
+         */
+        fs_info->submit_workers.idle_thresh = 64;
+        fs_info->workers.idle_thresh = 16;
+        fs_info->workers.ordered = 1;
+        fs_info->delalloc_workers.idle_thresh = 2;
+        fs_info->delalloc_workers.ordered = 1;
+        btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1);
+        btrfs_init_workers(&fs_info->endio_workers, "endio",
+                           fs_info->thread_pool_size);
+        btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta",
+                           fs_info->thread_pool_size);
+        btrfs_init_workers(&fs_info->endio_meta_write_workers,
+                           "endio-meta-write", fs_info->thread_pool_size);
+        btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
+                           fs_info->thread_pool_size);
+        /*
+         * endios are largely parallel and should have a very
+         * low idle thresh
+         */
+        fs_info->endio_workers.idle_thresh = 4;
+        fs_info->endio_write_workers.idle_thresh = 64;
+        fs_info->endio_meta_write_workers.idle_thresh = 64;
+        btrfs_start_workers(&fs_info->workers, 1);
+        btrfs_start_workers(&fs_info->submit_workers, 1);
+        btrfs_start_workers(&fs_info->delalloc_workers, 1);
+        btrfs_start_workers(&fs_info->fixup_workers, 1);
+        btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
+        btrfs_start_workers(&fs_info->endio_meta_workers,
+                            fs_info->thread_pool_size);
+        btrfs_start_workers(&fs_info->endio_meta_write_workers,
+                            fs_info->thread_pool_size);
+        btrfs_start_workers(&fs_info->endio_write_workers,
+                            fs_info->thread_pool_size);
+        fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
+        fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
+                                    4 * 1024 * 1024 / PAGE_CACHE_SIZE);
+        nodesize = btrfs_super_nodesize(disk_super);
+        leafsize = btrfs_super_leafsize(disk_super);
+        sectorsize = btrfs_super_sectorsize(disk_super);
+        stripesize = btrfs_super_stripesize(disk_super);
+        tree_root->nodesize = nodesize;
+        tree_root->leafsize = leafsize;
+        tree_root->sectorsize = sectorsize;
+        tree_root->stripesize = stripesize;
+        sb->s_blocksize = sectorsize;
+        sb->s_blocksize_bits = blksize_bits(sectorsize);
+        if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
+                    sizeof(disk_super->magic))) {
+                printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id);
+                goto fail_sb_buffer;
+        }
+        mutex_lock(&fs_info->chunk_mutex);
+        ret = btrfs_read_sys_array(tree_root);
+        mutex_unlock(&fs_info->chunk_mutex);
+        if (ret) {
+                printk(KERN_WARNING "btrfs: failed to read the system "
+                       "array on %s\n", sb->s_id);
+                goto fail_sys_array;
+        }
+        blocksize = btrfs_level_size(tree_root,
+                                     btrfs_super_chunk_root_level(disk_super));
+        generation = btrfs_super_chunk_root_generation(disk_super);
+        __setup_root(nodesize, leafsize, sectorsize, stripesize,
+                     chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);
+        chunk_root->node = read_tree_block(chunk_root,
+                                           btrfs_super_chunk_root(disk_super),
+                                           blocksize, generation);
+        BUG_ON(!chunk_root->node);
+        read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
+           (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
+           BTRFS_UUID_SIZE);
+        mutex_lock(&fs_info->chunk_mutex);
+        ret = btrfs_read_chunk_tree(chunk_root);
+        mutex_unlock(&fs_info->chunk_mutex);
+        if (ret) {
+                printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
+                       sb->s_id);
+                goto fail_chunk_root;
+        }
+        btrfs_close_extra_devices(fs_devices);
+        blocksize = btrfs_level_size(tree_root,
+                                     btrfs_super_root_level(disk_super));
+        generation = btrfs_super_generation(disk_super);
+        tree_root->node = read_tree_block(tree_root,
+                                          btrfs_super_root(disk_super),
+                                          blocksize, generation);
+        if (!tree_root->node)
+                goto fail_chunk_root;
+        ret = find_and_setup_root(tree_root, fs_info,
+                                  BTRFS_EXTENT_TREE_OBJECTID, extent_root);
+        if (ret)
+                goto fail_tree_root;
+        extent_root->track_dirty = 1;
+        ret = find_and_setup_root(tree_root, fs_info,
+                                  BTRFS_DEV_TREE_OBJECTID, dev_root);
+        dev_root->track_dirty = 1;
+        if (ret)
+                goto fail_extent_root;
+        ret = find_and_setup_root(tree_root, fs_info,
+                                  BTRFS_CSUM_TREE_OBJECTID, csum_root);
+        if (ret)
+                goto fail_extent_root;
+        csum_root->track_dirty = 1;
+        btrfs_read_block_groups(extent_root);
+        fs_info->generation = generation;
+        fs_info->last_trans_committed = generation;
+        fs_info->data_alloc_profile = (u64)-1;
+        fs_info->metadata_alloc_profile = (u64)-1;
+        fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
+        fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
+                                               "btrfs-cleaner");
+        if (!fs_info->cleaner_kthread)
+                goto fail_csum_root;
+        fs_info->transaction_kthread = kthread_run(transaction_kthread,
+                                                   tree_root,
+                                                   "btrfs-transaction");
+        if (!fs_info->transaction_kthread)
+                goto fail_cleaner;
+        if (btrfs_super_log_root(disk_super) != 0) {
+                u64 bytenr = btrfs_super_log_root(disk_super);
+                if (fs_devices->rw_devices == 0) {
+                        printk(KERN_WARNING "Btrfs log replay required "
+                               "on RO media\n");
+                        err = -EIO;
+                        goto fail_trans_kthread;
+                }
+                blocksize =
+                     btrfs_level_size(tree_root,
+                                      btrfs_super_log_root_level(disk_super));
+                log_tree_root = kzalloc(sizeof(struct btrfs_root),
+                                                      GFP_NOFS);
+                __setup_root(nodesize, leafsize, sectorsize, stripesize,
+                             log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
+                log_tree_root->node = read_tree_block(tree_root, bytenr,
+                                                      blocksize,
+                                                      generation + 1);
+                ret = btrfs_recover_log_trees(log_tree_root);
+                BUG_ON(ret);
+                if (sb->s_flags & MS_RDONLY) {
+                        ret =  btrfs_commit_super(tree_root);
+                        BUG_ON(ret);
+                }
+        }
+        if (!(sb->s_flags & MS_RDONLY)) {
+                ret = btrfs_cleanup_reloc_trees(tree_root);
+                BUG_ON(ret);
+        }
+        location.objectid = BTRFS_FS_TREE_OBJECTID;
+        location.type = BTRFS_ROOT_ITEM_KEY;
+        location.offset = (u64)-1;
+        fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
+        if (!fs_info->fs_root)
+                goto fail_trans_kthread;
+        return tree_root;
+fail_trans_kthread:
+        kthread_stop(fs_info->transaction_kthread);
+fail_cleaner:
+        kthread_stop(fs_info->cleaner_kthread);
+        /*
+         * make sure we're done with the btree inode before we stop our
+         * kthreads
+         */
+        filemap_write_and_wait(fs_info->btree_inode->i_mapping);
+        invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
+fail_csum_root:
+        free_extent_buffer(csum_root->node);
+fail_extent_root:
+        free_extent_buffer(extent_root->node);
+fail_tree_root:
+        free_extent_buffer(tree_root->node);
+fail_chunk_root:
+        free_extent_buffer(chunk_root->node);
+fail_sys_array:
+        free_extent_buffer(dev_root->node);
+fail_sb_buffer:
+        btrfs_stop_workers(&fs_info->fixup_workers);
+        btrfs_stop_workers(&fs_info->delalloc_workers);
+        btrfs_stop_workers(&fs_info->workers);
+        btrfs_stop_workers(&fs_info->endio_workers);
+        btrfs_stop_workers(&fs_info->endio_meta_workers);
+        btrfs_stop_workers(&fs_info->endio_meta_write_workers);
+        btrfs_stop_workers(&fs_info->endio_write_workers);
+        btrfs_stop_workers(&fs_info->submit_workers);
+fail_iput:
+        invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
+        iput(fs_info->btree_inode);
+fail:
+        btrfs_close_devices(fs_info->fs_devices);
+        btrfs_mapping_tree_free(&fs_info->mapping_tree);
+        kfree(extent_root);
+        kfree(tree_root);
+        bdi_destroy(&fs_info->bdi);
+        kfree(fs_info);
+        kfree(chunk_root);
+        kfree(dev_root);
+        kfree(csum_root);
+        return ERR_PTR(err);
+}
+/*
+ * Completion handler installed on super block buffers by write_dev_supers().
+ *
+ * On success the buffer is marked uptodate.  On failure a rate-limited
+ * warning is logged (skipped when the buffer is flagged eopnotsupp) and the
+ * uptodate bit is cleared so the waiter can detect the error.  In both cases
+ * the buffer is unlocked and one reference is dropped.
+ */
+static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
+{
+        char name[BDEVNAME_SIZE];
+        if (!uptodate) {
+                if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
+                        printk(KERN_WARNING "lost page write due to "
+                                        "I/O error on %s\n",
+                                       bdevname(bh->b_bdev, name));
+                }
+                /*
+                 * note, we don't set_buffer_write_io_error because we have
+                 * our own ways of dealing with the IO errors
+                 */
+                clear_buffer_uptodate(bh);
+        } else {
+                set_buffer_uptodate(bh);
+        }
+        unlock_buffer(bh);
+        put_bh(bh);
+}
+/*
+ * Read the super block from a block device.
+ *
+ * Only the primary copy (mirror 0) is examined; the comment in the body
+ * explains why the other mirrors are not scanned.  A copy is accepted when
+ * its recorded bytenr matches the offset it was read from and its magic
+ * matches BTRFS_MAGIC.
+ *
+ * Returns the buffer_head holding the accepted super block, with the
+ * reference obtained from __bread() still held, or NULL if none was found.
+ */
+struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
+{
+        struct buffer_head *best = NULL;
+        struct buffer_head *bh;
+        struct btrfs_super_block *sb;
+        u64 best_gen = 0;
+        u64 bytenr;
+        int mirror;
+        /* we would like to check all the supers, but that would make
+         * a btrfs mount succeed after a mkfs from a different FS.
+         * So, we need to add a special mount option to scan for
+         * later supers, using BTRFS_SUPER_MIRROR_MAX instead
+         */
+        for (mirror = 0; mirror < 1; mirror++) {
+                bytenr = btrfs_sb_offset(mirror);
+                /* stop once a copy would not fit on the device */
+                if (bytenr + 4096 >= i_size_read(bdev->bd_inode))
+                        break;
+                bh = __bread(bdev, bytenr / 4096, 4096);
+                if (!bh)
+                        continue;
+                sb = (struct btrfs_super_block *)bh->b_data;
+                if (btrfs_super_bytenr(sb) == bytenr &&
+                    !strncmp((char *)(&sb->magic), BTRFS_MAGIC,
+                             sizeof(sb->magic)) &&
+                    (!best || btrfs_super_generation(sb) > best_gen)) {
+                        /* newest valid copy so far: swap it in */
+                        brelse(best);
+                        best = bh;
+                        best_gen = btrfs_super_generation(sb);
+                } else {
+                        brelse(bh);
+                }
+        }
+        return best;
+}
+/*
+ * Write (wait == 0) or wait for (wait != 0) the super block copies of one
+ * device.
+ *
+ * @device:      device whose super block mirrors are handled
+ * @sb:          in-memory super block; its bytenr and csum are rewritten
+ *               for every mirror before it is copied into the buffer
+ * @do_barriers: request a barrier write on the last mirror that fits
+ * @wait:        0 to fill and submit the buffers, non-zero to look up the
+ *               previously submitted buffers and wait on them
+ * @max_mirrors: number of mirrors to handle, 0 means BTRFS_SUPER_MIRROR_MAX
+ *
+ * Mirrors whose offset does not fit inside device->total_bytes end the loop.
+ *
+ * Returns 0 when fewer writes failed than mirrors were attempted, -1
+ * otherwise (which includes the case where no mirror fit at all).
+ */
+static int write_dev_supers(struct btrfs_device *device,
+                            struct btrfs_super_block *sb,
+                            int do_barriers, int wait, int max_mirrors)
+{
+        struct buffer_head *bh;
+        int i;
+        int ret;
+        int errors = 0;
+        u32 crc;
+        u64 bytenr;
+        int last_barrier = 0;
+        if (max_mirrors == 0)
+                max_mirrors = BTRFS_SUPER_MIRROR_MAX;
+        /* make sure only the last submit_bh does a barrier */
+        if (do_barriers) {
+                for (i = 0; i < max_mirrors; i++) {
+                        bytenr = btrfs_sb_offset(i);
+                        if (bytenr + BTRFS_SUPER_INFO_SIZE >=
+                            device->total_bytes)
+                                break;
+                        last_barrier = i;
+                }
+        }
+        for (i = 0; i < max_mirrors; i++) {
+                bytenr = btrfs_sb_offset(i);
+                if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
+                        break;
+                if (wait) {
+                        bh = __find_get_block(device->bdev, bytenr / 4096,
+                                              BTRFS_SUPER_INFO_SIZE);
+                        BUG_ON(!bh);
+                        /*
+                         * NOTE(review): bh is still used after this brelse();
+                         * presumably the extra reference taken with get_bh()
+                         * in the submit pass keeps it alive - confirm.
+                         */
+                        brelse(bh);
+                        wait_on_buffer(bh);
+                        if (buffer_uptodate(bh)) {
+                                brelse(bh);
+                                continue;
+                        }
+                        /*
+                         * NOTE(review): the write failed, so we fall through
+                         * and submit the buffer again; this path does not
+                         * call lock_buffer()/get_bh() first - confirm that
+                         * is intended.
+                         */
+                } else {
+                        /* stamp this mirror's offset, then checksum */
+                        btrfs_set_super_bytenr(sb, bytenr);
+                        crc = ~(u32)0;
+                        crc = btrfs_csum_data(NULL, (char *)sb +
+                                              BTRFS_CSUM_SIZE, crc,
+                                              BTRFS_SUPER_INFO_SIZE -
+                                              BTRFS_CSUM_SIZE);
+                        btrfs_csum_final(crc, sb->csum);
+                        bh = __getblk(device->bdev, bytenr / 4096,
+                                      BTRFS_SUPER_INFO_SIZE);
+                        memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
+                        set_buffer_uptodate(bh);
+                        /* extra reference, dropped by the end_io handler */
+                        get_bh(bh);
+                        lock_buffer(bh);
+                        bh->b_end_io = btrfs_end_buffer_write_sync;
+                }
+                if (i == last_barrier && do_barriers && device->barriers) {
+                        ret = submit_bh(WRITE_BARRIER, bh);
+                        if (ret == -EOPNOTSUPP) {
+                                /* retry as a plain write, barriers now off */
+                                printk("btrfs: disabling barriers on dev %s\n",
+                                       device->name);
+                                set_buffer_uptodate(bh);
+                                device->barriers = 0;
+                                get_bh(bh);
+                                lock_buffer(bh);
+                                ret = submit_bh(WRITE, bh);
+                        }
+                } else {
+                        ret = submit_bh(WRITE, bh);
+                }
+                if (!ret && wait) {
+                        wait_on_buffer(bh);
+                        if (!buffer_uptodate(bh))
+                                errors++;
+                } else if (ret) {
+                        errors++;
+                }
+                if (wait)
+                        brelse(bh);
+        }
+        /* i is the number of mirrors attempted */
+        return errors < i ? 0 : -1;
+}
+/*
+ * Write the commit super block to every device of the filesystem.
+ *
+ * Runs two passes over the device list: the first fills in the per-device
+ * item of the super block and submits the writes, the second waits for
+ * them.  Devices without a bdev count as errors in the first pass; devices
+ * that are not in the fs metadata or not writeable are skipped.
+ *
+ * @max_mirrors is passed through to write_dev_supers().
+ *
+ * BUG()s when more than (number of devices - 1) devices fail in either
+ * pass; otherwise returns 0.
+ */
+int write_all_supers(struct btrfs_root *root, int max_mirrors)
+{
+        struct list_head *head = &root->fs_info->fs_devices->devices;
+        struct btrfs_super_block *sb = &root->fs_info->super_for_commit;
+        struct btrfs_dev_item *dev_item = &sb->dev_item;
+        struct btrfs_device *dev;
+        int do_barriers = !btrfs_test_opt(root, NOBARRIER);
+        int max_errors;
+        int total_errors = 0;
+        int ret;
+        max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
+        /* pass 1: describe each device in the super block and submit it */
+        list_for_each_entry(dev, head, dev_list) {
+                if (!dev->bdev) {
+                        total_errors++;
+                        continue;
+                }
+                if (!(dev->in_fs_metadata && dev->writeable))
+                        continue;
+                btrfs_set_stack_device_generation(dev_item, 0);
+                btrfs_set_stack_device_type(dev_item, dev->type);
+                btrfs_set_stack_device_id(dev_item, dev->devid);
+                btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes);
+                btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used);
+                btrfs_set_stack_device_io_align(dev_item, dev->io_align);
+                btrfs_set_stack_device_io_width(dev_item, dev->io_width);
+                btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
+                memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
+                memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_UUID_SIZE);
+                btrfs_set_super_flags(sb, btrfs_super_flags(sb) |
+                                          BTRFS_HEADER_FLAG_WRITTEN);
+                ret = write_dev_supers(dev, sb, do_barriers, 0, max_mirrors);
+                if (ret)
+                        total_errors++;
+        }
+        if (total_errors > max_errors) {
+                printk(KERN_ERR "btrfs: %d errors while writing supers\n",
+                       total_errors);
+                BUG();
+        }
+        /* pass 2: wait for the writes submitted above */
+        total_errors = 0;
+        list_for_each_entry(dev, head, dev_list) {
+                if (!dev->bdev)
+                        continue;
+                if (!(dev->in_fs_metadata && dev->writeable))
+                        continue;
+                ret = write_dev_supers(dev, sb, do_barriers, 1, max_mirrors);
+                if (ret)
+                        total_errors++;
+        }
+        if (total_errors > max_errors) {
+                printk(KERN_ERR "btrfs: %d errors while writing supers\n",
+                       total_errors);
+                BUG();
+        }
+        return 0;
+}
+/*
+ * Thin wrapper around write_all_supers().  @trans is accepted for the
+ * callers' benefit but is not used here.
+ */
+int write_ctree_super(struct btrfs_trans_handle *trans,
+                      struct btrfs_root *root, int max_mirrors)
+{
+        return write_all_supers(root, max_mirrors);
+}
+/*
+ * Remove @root from fs_info->fs_roots_radix and release everything it owns:
+ * its anonymous super (when one was set up), its node and commit_root
+ * extent buffers, its name and finally the root structure itself.
+ *
+ * Always returns 0.
+ */
+int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
+{
+        struct super_block *anon = &root->anon_super;
+        radix_tree_delete(&fs_info->fs_roots_radix,
+                          (unsigned long)root->root_key.objectid);
+        if (anon->s_dev) {
+                /* s_umount is taken before handing the super to the VFS */
+                down_write(&anon->s_umount);
+                kill_anon_super(anon);
+        }
+        if (root->node)
+                free_extent_buffer(root->node);
+        if (root->commit_root)
+                free_extent_buffer(root->commit_root);
+        kfree(root->name);
+        kfree(root);
+        return 0;
+}
+/*
+ * Free every root still present in fs_info->fs_roots_radix.
+ *
+ * Roots are looked up in batches starting from index 0 each time;
+ * btrfs_free_fs_root() deletes each one from the radix tree, so the loop
+ * ends once a lookup finds nothing.  Always returns 0.
+ */
+static int del_fs_roots(struct btrfs_fs_info *fs_info)
+{
+        struct btrfs_root *batch[8];
+        int found;
+        int idx;
+        for (;;) {
+                found = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
+                                               (void **)batch, 0,
+                                               ARRAY_SIZE(batch));
+                if (found == 0)
+                        break;
+                for (idx = 0; idx < found; idx++)
+                        btrfs_free_fs_root(fs_info, batch[idx]);
+        }
+        return 0;
+}
+/*
+ * Walk every root in fs_info->fs_roots_radix, look up its dead roots and
+ * run orphan cleanup on it.
+ *
+ * Roots are fetched in batches of up to ARRAY_SIZE(gang); the next lookup
+ * starts just past the objectid of the last root handled.
+ *
+ * BUG()s if btrfs_find_dead_roots() fails; otherwise returns 0.
+ */
+int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
+{
+        u64 root_objectid = 0;
+        struct btrfs_root *gang[8];
+        int found;
+        int i;
+        int ret;
+        while (1) {
+                found = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
+                                               (void **)gang, root_objectid,
+                                               ARRAY_SIZE(gang));
+                if (!found)
+                        break;
+                /*
+                 * The batch size is kept in its own variable: reusing 'ret'
+                 * for the per-root result zeroed the loop bound, so only
+                 * gang[0] of every batch was processed before the next
+                 * lookup.
+                 */
+                for (i = 0; i < found; i++) {
+                        root_objectid = gang[i]->root_key.objectid;
+                        ret = btrfs_find_dead_roots(fs_info->tree_root,
+                                                    root_objectid, gang[i]);
+                        BUG_ON(ret);
+                        btrfs_orphan_cleanup(gang[i]);
+                }
+                /* resume after the last root handled in this batch */
+                root_objectid++;
+        }
+        return 0;
+}
+/*
+ * Flush the filesystem to disk: clean old snapshots under cleaner_mutex,
+ * commit two transactions back to back, write and wait for the transaction
+ * pages, then write the super blocks.
+ *
+ * BUG()s if the first commit or the write-and-wait step fails.  Returns the
+ * result of write_ctree_super().
+ */
+int btrfs_commit_super(struct btrfs_root *root)
+{
+        struct btrfs_trans_handle *trans;
+        int ret;
+        mutex_lock(&root->fs_info->cleaner_mutex);
+        btrfs_clean_old_snapshots(root);
+        mutex_unlock(&root->fs_info->cleaner_mutex);
+        trans = btrfs_start_transaction(root, 1);
+        ret = btrfs_commit_transaction(trans, root);
+        BUG_ON(ret);
+        /* run commit again to drop the original snapshot */
+        trans = btrfs_start_transaction(root, 1);
+        /*
+         * NOTE(review): the result of this second commit is ignored, unlike
+         * the first one - confirm that is intentional.
+         */
+        btrfs_commit_transaction(trans, root);
+        ret = btrfs_write_and_wait_transaction(NULL, root);
+        BUG_ON(ret);
+        ret = write_ctree_super(NULL, root, 0);
+        return ret;
+}
+/*
+ * Tear down the filesystem state reachable from @root->fs_info.
+ *
+ * Sequence: mark the fs as closing, stop the transaction and cleaner
+ * kthreads, commit the super block unless the fs is read-only, report any
+ * leftover delalloc bytes / reference cache, drop the root nodes of the
+ * extent, tree, chunk, dev and csum roots, free the block groups and all
+ * fs roots, release the btree inode, stop the worker pools, close the
+ * devices and finally free the root structures.
+ *
+ * A failing commit is only logged.  Always returns 0.
+ *
+ * NOTE(review): fs_info itself is not freed here - presumably the caller
+ * owns it; confirm.
+ */
+int close_ctree(struct btrfs_root *root)
+{
+        struct btrfs_fs_info *fs_info = root->fs_info;
+        int ret;
+        fs_info->closing = 1;
+        /* publish 'closing' before the kthreads are stopped */
+        smp_mb();
+        kthread_stop(fs_info->transaction_kthread);
+        kthread_stop(fs_info->cleaner_kthread);
+        if (!(fs_info->sb->s_flags & MS_RDONLY)) {
+                ret = btrfs_commit_super(root);
+                if (ret)
+                        printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
+        }
+        if (fs_info->delalloc_bytes) {
+                /* cast so %llu is right whatever type the counter has */
+                printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
+                       (unsigned long long)fs_info->delalloc_bytes);
+        }
+        if (fs_info->total_ref_cache_size) {
+                printk(KERN_INFO "btrfs: at umount reference cache size %llu\n",
+                       (unsigned long long)fs_info->total_ref_cache_size);
+        }
+        if (fs_info->extent_root->node)
+                free_extent_buffer(fs_info->extent_root->node);
+        if (fs_info->tree_root->node)
+                free_extent_buffer(fs_info->tree_root->node);
+        if (fs_info->chunk_root->node)
+                free_extent_buffer(fs_info->chunk_root->node);
+        if (fs_info->dev_root->node)
+                free_extent_buffer(fs_info->dev_root->node);
+        if (fs_info->csum_root->node)
+                free_extent_buffer(fs_info->csum_root->node);
+        btrfs_free_block_groups(fs_info);
+        del_fs_roots(fs_info);
+        iput(fs_info->btree_inode);
+        btrfs_stop_workers(&fs_info->fixup_workers);
+        btrfs_stop_workers(&fs_info->delalloc_workers);
+        btrfs_stop_workers(&fs_info->workers);
+        btrfs_stop_workers(&fs_info->endio_workers);
+        btrfs_stop_workers(&fs_info->endio_meta_workers);
+        btrfs_stop_workers(&fs_info->endio_meta_write_workers);
+        btrfs_stop_workers(&fs_info->endio_write_workers);
+        btrfs_stop_workers(&fs_info->submit_workers);
+#if 0
+        while (!list_empty(&fs_info->hashers)) {
+                struct btrfs_hasher *hasher;
+                hasher = list_entry(fs_info->hashers.next, struct btrfs_hasher,
+                                    hashers);
+                list_del(&hasher->hashers);
+                crypto_free_hash(&fs_info->hash_tfm);
+                kfree(hasher);
+        }
+#endif
+        btrfs_close_devices(fs_info->fs_devices);
+        btrfs_mapping_tree_free(&fs_info->mapping_tree);
+        bdi_destroy(&fs_info->bdi);
+        kfree(fs_info->extent_root);
+        kfree(fs_info->tree_root);
+        kfree(fs_info->chunk_root);
+        kfree(fs_info->dev_root);
+        kfree(fs_info->csum_root);
+        return 0;
+}
+/*
+ * Report whether @buf can be used as-is.
+ *
+ * Returns 0 when the extent buffer is not uptodate in the btree inode's
+ * io_tree.  Otherwise the buffer is checked with verify_parent_transid()
+ * against @parent_transid and non-zero is returned only when that check
+ * returns 0.
+ */
+int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid)
+{
+        struct inode *btree_inode = buf->first_page->mapping->host;
+        struct extent_io_tree *tree = &BTRFS_I(btree_inode)->io_tree;
+        int uptodate = extent_buffer_uptodate(tree, buf);
+        if (!uptodate)
+                return uptodate;
+        return !verify_parent_transid(tree, buf, parent_transid);
+}
+/*
+ * Mark @buf uptodate in the io_tree of the inode that owns its first page.
+ * Returns whatever set_extent_buffer_uptodate() returns.
+ */
+int btrfs_set_buffer_uptodate(struct extent_buffer *buf)
+{
+        struct inode *btree_inode = buf->first_page->mapping->host;
+        struct extent_io_tree *tree = &BTRFS_I(btree_inode)->io_tree;
+        return set_extent_buffer_uptodate(tree, buf);
+}
+/*
+ * Mark @buf dirty in the btree inode's io_tree.
+ *
+ * Warns when the caller does not hold the tree lock on @buf, and logs a
+ * critical message plus a warning when the generation stored in the buffer
+ * header differs from fs_info->generation.  The buffer is marked dirty in
+ * either case.
+ */
+void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
+{
+        struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
+        struct btrfs_fs_info *fs_info = root->fs_info;
+        u64 buf_gen = btrfs_header_generation(buf);
+        struct inode *btree_inode = fs_info->btree_inode;
+        WARN_ON(!btrfs_tree_locked(buf));
+        if (buf_gen != fs_info->generation) {
+                printk(KERN_CRIT "btrfs transid mismatch buffer %llu, "
+                       "found %llu running %llu\n",
+                        (unsigned long long)buf->start,
+                        (unsigned long long)buf_gen,
+                        (unsigned long long)fs_info->generation);
+                WARN_ON(1);
+        }
+        set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf);
+}
+/*
+ * Throttle the caller when the btree inode has more than 32MB of dirty
+ * extent state.
+ *
+ * Does nothing when running as pdflush or with PF_MEMALLOC set.  @nr is
+ * not used.
+ */
+void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
+{
+        /*
+         * looks as though older kernels can get into trouble with
+         * this code, they end up stuck in balance_dirty_pages forever
+         */
+        struct inode *btree_inode = root->fs_info->btree_inode;
+        struct extent_io_tree *tree = &BTRFS_I(btree_inode)->io_tree;
+        unsigned long thresh = 32 * 1024 * 1024;
+        u64 start = 0;
+        u64 num_dirty;
+        if (current_is_pdflush() || (current->flags & PF_MEMALLOC))
+                return;
+        num_dirty = count_range_bits(tree, &start, (u64)-1,
+                                     thresh, EXTENT_DIRTY);
+        if (num_dirty <= thresh)
+                return;
+        balance_dirty_pages_ratelimited_nr(btree_inode->i_mapping, 1);
+}
+/*
+ * Read the pages of @buf through btree_read_extent_buffer_pages() and, on
+ * success (return value 0), set EXTENT_UPTODATE in buf->flags.
+ *
+ * Returns the result of the read.
+ */
+int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
+{
+        struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
+        int err = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
+        if (!err)
+                buf->flags |= EXTENT_UPTODATE;
+        return err;
+}
+/*
+ * Lock @page, first flagging the extent buffer that starts on it as
+ * written.
+ *
+ * When page->private is not the plain EXTENT_PAGE_PRIVATE marker, the
+ * buffer length is taken from page->private >> 2 and the buffer is looked
+ * up in the inode's io_tree.  If found, BTRFS_HEADER_FLAG_WRITTEN is set in
+ * its header under the tree lock and fs_info->hash_lock.  The page is
+ * locked in every case.
+ *
+ * Always returns 0.
+ */
+int btree_lock_page_hook(struct page *page)
+{
+        struct inode *inode = page->mapping->host;
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+        struct extent_buffer *eb;
+        unsigned long len;
+        u64 bytenr = page_offset(page);
+        if (page->private != EXTENT_PAGE_PRIVATE) {
+                len = page->private >> 2;
+                eb = find_extent_buffer(io_tree, bytenr, len, GFP_NOFS);
+                if (eb) {
+                        btrfs_tree_lock(eb);
+                        spin_lock(&root->fs_info->hash_lock);
+                        btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
+                        spin_unlock(&root->fs_info->hash_lock);
+                        btrfs_tree_unlock(eb);
+                        free_extent_buffer(eb);
+                }
+        }
+        lock_page(page);
+        return 0;
+}
+/*
+ * Hook table handing the btree-specific callbacks to the extent_io layer.
+ * NOTE(review): where this table gets installed is not visible from here.
+ */
+static struct extent_io_ops btree_extent_io_ops = {
+        .write_cache_pages_lock_hook = btree_lock_page_hook,
+        .readpage_end_io_hook = btree_readpage_end_io_hook,
+        .submit_bio_hook = btree_submit_bio_hook,
+        /* note we're sharing with inode.c for the merge bio hook */
+        .merge_bio_hook = btrfs_merge_bio_hook,
+};
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
new file mode 100644
index 000000000000..c0ff404c31b7
--- /dev/null
+++ b/fs/btrfs/disk-io.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#ifndef __DISKIO__
+#define __DISKIO__
+/* byte offset of the primary super block (mirror 0) on each device */
+#define BTRFS_SUPER_INFO_OFFSET (64 * 1024)
+/* number of bytes written for each super block copy */
+#define BTRFS_SUPER_INFO_SIZE 4096
+/* number of copies handled when a caller passes max_mirrors == 0 */
+#define BTRFS_SUPER_MIRROR_MAX   3
+/* mirror N (N > 0) lives at 16K << (BTRFS_SUPER_MIRROR_SHIFT * N) */
+#define BTRFS_SUPER_MIRROR_SHIFT 12
+/*
+ * Byte offset of super block copy @mirror on a device.
+ *
+ * Mirror 0 sits at BTRFS_SUPER_INFO_OFFSET.  Every further mirror sits at
+ * 16K shifted left by BTRFS_SUPER_MIRROR_SHIFT bits per mirror index, i.e.
+ * 64MB for mirror 1 and 256GB for mirror 2.
+ */
+static inline u64 btrfs_sb_offset(int mirror)
+{
+        if (!mirror)
+                return BTRFS_SUPER_INFO_OFFSET;
+        return (u64)(16 * 1024) << (BTRFS_SUPER_MIRROR_SHIFT * mirror);
+}
+/* only pointers to these are used below, so forward declarations suffice */
+struct btrfs_device;
+struct btrfs_fs_devices;
+struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
+                                      u32 blocksize, u64 parent_transid);
+int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
+                         u64 parent_transid);
+struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
+                                                   u64 bytenr, u32 blocksize);
+int clean_tree_block(struct btrfs_trans_handle *trans,
+                     struct btrfs_root *root, struct extent_buffer *buf);
+/* open_ctree() returns the tree root or an ERR_PTR() on failure */
+struct btrfs_root *open_ctree(struct super_block *sb,
+                              struct btrfs_fs_devices *fs_devices,
+                              char *options);
+/* tears down everything reachable from root->fs_info; always returns 0 */
+int close_ctree(struct btrfs_root *root);
+/* max_mirrors == 0 selects BTRFS_SUPER_MIRROR_MAX copies per device */
+int write_ctree_super(struct btrfs_trans_handle *trans,
+                      struct btrfs_root *root, int max_mirrors);
+/* returns NULL when no valid super block was found on bdev */
+struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
+int btrfs_commit_super(struct btrfs_root *root);
+struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
+                                            u64 bytenr, u32 blocksize);
+struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
+                                        u64 root_objectid);
+struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
+                                      struct btrfs_key *location,
+                                      const char *name, int namelen);
+struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
+                                               struct btrfs_key *location);
+struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
+                                              struct btrfs_key *location);
+int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
+int btrfs_insert_dev_radix(struct btrfs_root *root,
+                           struct block_device *bdev,
+                           u64 device_id,
+                           u64 block_start,
+                           u64 num_blocks);
+/* the nr argument is currently ignored by the implementation */
+void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
+int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
+void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
+int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid);
+int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
+int wait_on_tree_block_writeback(struct btrfs_root *root,
+                                 struct extent_buffer *buf);
+int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
+u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len);
+void btrfs_csum_final(u32 crc, char *result);
+int btrfs_open_device(struct btrfs_device *dev);
+int btrfs_verify_block_csum(struct btrfs_root *root,
+                            struct extent_buffer *buf);
+int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
+                        int metadata);
+int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
+                        int rw, struct bio *bio, int mirror_num,
+                        unsigned long bio_flags,
+                        extent_submit_bio_hook_t *submit_bio_start,
+                        extent_submit_bio_hook_t *submit_bio_done);
+int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
+unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
+int btrfs_write_tree_block(struct extent_buffer *buf);
+int btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
+int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
+                             struct btrfs_fs_info *fs_info);
+int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
+                             struct btrfs_fs_info *fs_info);
+/* locks the page and always returns 0 */
+int btree_lock_page_hook(struct page *page);
+#endif
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
new file mode 100644
index 000000000000..85315d2c90de
--- /dev/null
+++ b/fs/btrfs/export.c
@@ -0,0 +1,203 @@
+#include <linux/fs.h>
+#include <linux/types.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "btrfs_inode.h"
+#include "print-tree.h"
+#include "export.h"
+#include "compat.h"
+/*
+ * File handle lengths, counted in 4-byte units (byte offsets into
+ * struct btrfs_fid divided by 4).  From shortest to longest: object only,
+ * object plus parent, object plus parent plus the parent's root.
+ */
+#define BTRFS_FID_SIZE_NON_CONNECTABLE (offsetof(struct btrfs_fid, \
+                                                 parent_objectid) / 4)
+#define BTRFS_FID_SIZE_CONNECTABLE (offsetof(struct btrfs_fid, \
+                                             parent_root_objectid) / 4)
+#define BTRFS_FID_SIZE_CONNECTABLE_ROOT (sizeof(struct btrfs_fid) / 4)
+/*
+ * Encode a file handle for @dentry into @fh.
+ *
+ * @max_len:     in: room available in @fh, in 4-byte units;
+ *               out: number of units actually used
+ * @connectable: also encode the parent (only done for non-directories)
+ *
+ * Returns the FILEID_BTRFS_* type that was encoded, or 255 when @fh is too
+ * small for the handle that would be required.
+ */
+static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
+                           int connectable)
+{
+        struct btrfs_fid *fid = (struct btrfs_fid *)fh;
+        struct inode *inode = dentry->d_inode;
+        int len = *max_len;
+        int type;
+        if ((len < BTRFS_FID_SIZE_NON_CONNECTABLE) ||
+            (connectable && len < BTRFS_FID_SIZE_CONNECTABLE))
+                return 255;
+        len = BTRFS_FID_SIZE_NON_CONNECTABLE;
+        type = FILEID_BTRFS_WITHOUT_PARENT;
+        fid->objectid = BTRFS_I(inode)->location.objectid;
+        fid->root_objectid = BTRFS_I(inode)->root->objectid;
+        fid->gen = inode->i_generation;
+        if (connectable && !S_ISDIR(inode->i_mode)) {
+                struct inode *parent;
+                u64 parent_root_id;
+                /* d_lock keeps d_parent stable while it is sampled */
+                spin_lock(&dentry->d_lock);
+                parent = dentry->d_parent->d_inode;
+                fid->parent_objectid = BTRFS_I(parent)->location.objectid;
+                fid->parent_gen = parent->i_generation;
+                parent_root_id = BTRFS_I(parent)->root->objectid;
+                spin_unlock(&dentry->d_lock);
+                if (parent_root_id != fid->root_objectid) {
+                        /*
+                         * The parent's root only fits in the largest handle
+                         * format; the check at the top guaranteed room for
+                         * BTRFS_FID_SIZE_CONNECTABLE only, so verify before
+                         * writing past it.
+                         */
+                        if (*max_len < BTRFS_FID_SIZE_CONNECTABLE_ROOT)
+                                return 255;
+                        fid->parent_root_objectid = parent_root_id;
+                        len = BTRFS_FID_SIZE_CONNECTABLE_ROOT;
+                        type = FILEID_BTRFS_WITH_PARENT_ROOT;
+                } else {
+                        len = BTRFS_FID_SIZE_CONNECTABLE;
+                        type = FILEID_BTRFS_WITH_PARENT;
+                }
+        }
+        *max_len = len;
+        return type;
+}
+/*
+ * Turn an (objectid, root_objectid, generation) triple from a file handle
+ * into a dentry.
+ *
+ * Returns the dentry from d_obtain_alias(), the error from the root or
+ * inode lookup, or ERR_PTR(-ESTALE) when the inode's generation does not
+ * match @generation.
+ */
+static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
+                                       u64 root_objectid, u32 generation)
+{
+        struct btrfs_root *root;
+        struct inode *inode;
+        struct btrfs_key key;
+        /* first find the root the object lives in */
+        key.objectid = root_objectid;
+        btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+        key.offset = (u64)-1;
+        root = btrfs_read_fs_root_no_name(btrfs_sb(sb)->fs_info, &key);
+        if (IS_ERR(root))
+                return ERR_CAST(root);
+        /* then the inode inside that root */
+        key.objectid = objectid;
+        btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+        key.offset = 0;
+        inode = btrfs_iget(sb, &key, root, NULL);
+        if (IS_ERR(inode))
+                return ERR_CAST(inode);
+        if (generation == inode->i_generation)
+                return d_obtain_alias(inode);
+        /* the handle refers to an older incarnation of this inode */
+        iput(inode);
+        return ERR_PTR(-ESTALE);
+}
+/*
+ * Decode the parent directory stored in a file handle.
+ *
+ * Only the two handle types that carry a parent are accepted, and only
+ * with their exact length; anything else yields NULL.  The parent is
+ * looked up in the object's own root for FILEID_BTRFS_WITH_PARENT and in
+ * parent_root_objectid for FILEID_BTRFS_WITH_PARENT_ROOT.
+ */
+static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
+                                         int fh_len, int fh_type)
+{
+        struct btrfs_fid *fid = (struct btrfs_fid *) fh;
+        u64 root_objectid;
+        switch (fh_type) {
+        case FILEID_BTRFS_WITH_PARENT:
+                if (fh_len != BTRFS_FID_SIZE_CONNECTABLE)
+                        return NULL;
+                root_objectid = fid->root_objectid;
+                break;
+        case FILEID_BTRFS_WITH_PARENT_ROOT:
+                if (fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT)
+                        return NULL;
+                root_objectid = fid->parent_root_objectid;
+                break;
+        default:
+                return NULL;
+        }
+        return btrfs_get_dentry(sb, fid->parent_objectid, root_objectid,
+                                fid->parent_gen);
+}
+/*
+ * Decode the object itself from a file handle.
+ *
+ * Each of the three btrfs handle types is accepted only with its exact
+ * length; any other type/length combination yields NULL.
+ */
+static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
+                                         int fh_len, int fh_type)
+{
+        struct btrfs_fid *fid = (struct btrfs_fid *) fh;
+        int want_len;
+        switch (fh_type) {
+        case FILEID_BTRFS_WITH_PARENT:
+                want_len = BTRFS_FID_SIZE_CONNECTABLE;
+                break;
+        case FILEID_BTRFS_WITH_PARENT_ROOT:
+                want_len = BTRFS_FID_SIZE_CONNECTABLE_ROOT;
+                break;
+        case FILEID_BTRFS_WITHOUT_PARENT:
+                want_len = BTRFS_FID_SIZE_NON_CONNECTABLE;
+                break;
+        default:
+                return NULL;
+        }
+        if (fh_len != want_len)
+                return NULL;
+        return btrfs_get_dentry(sb, fid->objectid, fid->root_objectid,
+                                fid->gen);
+}
+/*
+ * Find the parent directory of @child by searching for the highest
+ * INODE_REF key of the child's inode; the key offset of that item is the
+ * parent's objectid.
+ *
+ * Returns a dentry for the parent, the super block's root dentry when the
+ * inode is its own parent, or an ERR_PTR(): -ENOMEM when no path could be
+ * allocated, the error from the tree search, -EIO when no key precedes
+ * the search position, -EINVAL when the preceding key is not an inode ref
+ * of this inode.
+ */
+static struct dentry *btrfs_get_parent(struct dentry *child)
+{
+        struct inode *dir = child->d_inode;
+        struct btrfs_root *root = BTRFS_I(dir)->root;
+        struct btrfs_key key;
+        struct btrfs_path *path;
+        struct extent_buffer *leaf;
+        int slot;
+        u64 objectid;
+        int ret;
+        path = btrfs_alloc_path();
+        /* the path is dereferenced below, so fail cleanly when out of memory */
+        if (!path)
+                return ERR_PTR(-ENOMEM);
+        key.objectid = dir->i_ino;
+        btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
+        key.offset = (u64)-1;
+        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+        if (ret < 0) {
+                /* Error */
+                btrfs_free_path(path);
+                return ERR_PTR(ret);
+        }
+        leaf = path->nodes[0];
+        slot = path->slots[0];
+        if (ret) {
+                /* btrfs_search_slot() returns the slot where we'd want to
+                   insert a backref for parent inode #0xFFFFFFFFFFFFFFFF.
+                   The _real_ backref, telling us what the parent inode
+                   _actually_ is, will be in the slot _before_ the one
+                   that btrfs_search_slot() returns. */
+                if (!slot) {
+                        /* Unless there is _no_ key in the tree before... */
+                        btrfs_free_path(path);
+                        return ERR_PTR(-EIO);
+                }
+                slot--;
+        }
+        btrfs_item_key_to_cpu(leaf, &key, slot);
+        btrfs_free_path(path);
+        if (key.objectid != dir->i_ino || key.type != BTRFS_INODE_REF_KEY)
+                return ERR_PTR(-EINVAL);
+        objectid = key.offset;
+        /* If we are already at the root of a subvol, return the real root */
+        if (objectid == dir->i_ino)
+                return dget(dir->i_sb->s_root);
+        /* Build a new key for the inode item */
+        key.objectid = objectid;
+        btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+        key.offset = 0;
+        return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL));
+}
+/* export operations table; declared extern in export.h */
+const struct export_operations btrfs_export_ops = {
+        .encode_fh      = btrfs_encode_fh,
+        .fh_to_dentry   = btrfs_fh_to_dentry,
+        .fh_to_parent   = btrfs_fh_to_parent,
+        .get_parent     = btrfs_get_parent,
+};
diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h
new file mode 100644
index 000000000000..074348a95841
--- /dev/null
+++ b/fs/btrfs/export.h
@@ -0,0 +1,19 @@
+#ifndef BTRFS_EXPORT_H
+#define BTRFS_EXPORT_H
+#include <linux/exportfs.h>
+extern const struct export_operations btrfs_export_ops;
+/*
+ * File handle payload for btrfs exports.  The struct is packed, so no
+ * padding is inserted between the mixed u64/u32 members.
+ * NOTE(review): presumably filled in by btrfs_encode_fh() and parsed by
+ * the fh_to_* callbacks in export.c -- confirm there before changing the
+ * layout.
+ */
+struct btrfs_fid {
+        /* id, owning root and generation of the object itself */
+        u64 objectid;
+        u64 root_objectid;
+        u32 gen;
+        /* the same three values for the parent (presumably the parent dir) */
+        u64 parent_objectid;
+        u32 parent_gen;
+        u64 parent_root_objectid;
+} __attribute__ ((packed));
+#endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
new file mode 100644
index 000000000000..293da650873f
--- /dev/null
+++ b/fs/btrfs/extent-tree.c
@@ -0,0 +1,5986 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/blkdev.h>
+#include <linux/version.h>
+#include "compat.h"
+#include "hash.h"
+#include "crc32c.h"
+#include "ctree.h"
+#include "disk-io.h"
+#include "print-tree.h"
+#include "transaction.h"
+#include "volumes.h"
+#include "locking.h"
+#include "ref-cache.h"
+#include "compat.h"
+/*
+ * kinds of deferred extent tree operations; presumably the values stored
+ * in pending_extent_op.type (the code setting it is not in this part of
+ * the file)
+ */
+#define PENDING_EXTENT_INSERT 0
+#define PENDING_EXTENT_DELETE 1
+#define PENDING_BACKREF_UPDATE 2
+/*
+ * one deferred operation against the extent tree, queued on a list and
+ * processed by update_backrefs() / insert_extents()
+ */
+struct pending_extent_op {
+        /* one of the PENDING_* values above (presumably) */
+        int type;
+        /* start and length of the extent; also the range locked in extent_ins */
+        u64 bytenr;
+        u64 num_bytes;
+        /* key offset of the backref to create or to move the backref to */
+        u64 parent;
+        /* key offset of the existing backref looked up by update_backrefs() */
+        u64 orig_parent;
+        /* generation written into the backref */
+        u64 generation;
+        /* generation the existing backref must carry in update_backrefs() */
+        u64 orig_generation;
+        /* stored in / compared against the objectid field of the backref */
+        int level;
+        /* links the op into the insert or update list */
+        struct list_head list;
+        /*
+         * bumped by insert_extents() for each of the two items inserted;
+         * the op is freed once it reaches 2
+         */
+        int del;
+};
+static int finish_current_insert(struct btrfs_trans_handle *trans,
+                                 struct btrfs_root *extent_root, int all);
+static int del_pending_extents(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *extent_root, int all);
+static int pin_down_bytes(struct btrfs_trans_handle *trans,
+                          struct btrfs_root *root,
+                          u64 bytenr, u64 num_bytes, int is_data);
+static int update_block_group(struct btrfs_trans_handle *trans,
+                              struct btrfs_root *root,
+                              u64 bytenr, u64 num_bytes, int alloc,
+                              int mark_free);
+/* non-zero only when every flag in 'bits' is set on the block group */
+static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
+{
+        u64 masked = cache->flags & bits;
+        return masked == bits;
+}
+/*
+ * Insert a block group into the fs_info rb tree of cached block groups.
+ * The tree is ordered by the start of each group (key.objectid) and is
+ * protected by block_group_cache_lock.
+ *
+ * Returns 0 on success or -EEXIST if a group with the same start is
+ * already in the tree.
+ */
+static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
+                                struct btrfs_block_group_cache *block_group)
+{
+        struct rb_node **link;
+        struct rb_node *above = NULL;
+        struct btrfs_block_group_cache *entry;
+        int err = 0;
+        spin_lock(&info->block_group_cache_lock);
+        link = &info->block_group_cache_tree.rb_node;
+        while (*link) {
+                above = *link;
+                entry = rb_entry(above, struct btrfs_block_group_cache,
+                                 cache_node);
+                if (block_group->key.objectid == entry->key.objectid) {
+                        err = -EEXIST;
+                        goto unlock;
+                }
+                if (block_group->key.objectid < entry->key.objectid)
+                        link = &above->rb_left;
+                else
+                        link = &above->rb_right;
+        }
+        rb_link_node(&block_group->cache_node, above, link);
+        rb_insert_color(&block_group->cache_node,
+                        &info->block_group_cache_tree);
+unlock:
+        spin_unlock(&info->block_group_cache_lock);
+        return err;
+}
+/*
+ * Look up a block group in the fs_info rb tree.  With contains == 0 the
+ * group starting at or after bytenr is returned, otherwise the group
+ * whose range covers bytenr.
+ *
+ * The count of the returned group is incremented before the tree lock is
+ * dropped; NULL is returned when nothing matches.
+ */
+static struct btrfs_block_group_cache *
+block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
+                              int contains)
+{
+        struct btrfs_block_group_cache *entry, *ret = NULL;
+        struct rb_node *node;
+        u64 first, final;
+        spin_lock(&info->block_group_cache_lock);
+        node = info->block_group_cache_tree.rb_node;
+        while (node) {
+                entry = rb_entry(node, struct btrfs_block_group_cache,
+                                 cache_node);
+                first = entry->key.objectid;
+                final = first + entry->key.offset - 1;
+                if (bytenr == first) {
+                        ret = entry;
+                        break;
+                }
+                if (bytenr < first) {
+                        /* remember the lowest group that starts above bytenr */
+                        if (!contains && (!ret || first < ret->key.objectid))
+                                ret = entry;
+                        node = node->rb_left;
+                        continue;
+                }
+                /* bytenr lies past the start of this group */
+                if (contains && bytenr <= final) {
+                        ret = entry;
+                        break;
+                }
+                node = node->rb_right;
+        }
+        if (ret)
+                atomic_inc(&ret->count);
+        spin_unlock(&info->block_group_cache_lock);
+        return ret;
+}
+/*
+ * Add the range [start, end) to the free space cache of block_group,
+ * leaving out everything recorded in info->pinned_extents.
+ *
+ * this is only called by cache_block_group, since we could have freed extents
+ * we need to check the pinned_extents for any extents that can't be used yet
+ * since their free space will be released as soon as the transaction commits.
+ *
+ * Always returns 0; a failure of btrfs_add_free_space() trips a BUG_ON.
+ */
+static int add_new_free_space(struct btrfs_block_group_cache *block_group,
+                              struct btrfs_fs_info *info, u64 start, u64 end)
+{
+        u64 extent_start, extent_end, size;
+        int ret;
+        mutex_lock(&info->pinned_mutex);
+        while (start < end) {
+                ret = find_first_extent_bit(&info->pinned_extents, start,
+                                            &extent_start, &extent_end,
+                                            EXTENT_DIRTY);
+                if (ret)
+                        break;
+                if (extent_start <= start) {
+                        /*
+                         * the pinned extent covers 'start'.  It may begin
+                         * before the offset we asked for (this assumes the
+                         * lookup returns the first extent ending at or
+                         * after 'start'), so '<=' and not '==': otherwise
+                         * we would fall out of the loop and hand the pinned
+                         * bytes to the free space cache below.
+                         */
+                        start = extent_end + 1;
+                } else if (extent_start < end) {
+                        /* free gap in front of the next pinned extent */
+                        size = extent_start - start;
+                        ret = btrfs_add_free_space(block_group, start,
+                                                   size);
+                        BUG_ON(ret);
+                        start = extent_end + 1;
+                } else {
+                        /* the next pinned extent starts past our range */
+                        break;
+                }
+        }
+        /* whatever is left after the last pinned extent is free as well */
+        if (start < end) {
+                size = end - start;
+                ret = btrfs_add_free_space(block_group, start, size);
+                BUG_ON(ret);
+        }
+        mutex_unlock(&info->pinned_mutex);
+        return 0;
+}
+/*
+ * For every super block mirror, ask btrfs_rmap_block() which logical
+ * addresses of this block group correspond to the mirror's offset and
+ * take a stripe_len sized piece at each of them out of the group's free
+ * space (presumably so the super block copies are never allocated).
+ *
+ * Always returns 0; a btrfs_rmap_block() failure trips a BUG_ON.
+ */
+static int remove_sb_from_cache(struct btrfs_root *root,
+                                struct btrfs_block_group_cache *cache)
+{
+        int mirror;
+        for (mirror = 0; mirror < BTRFS_SUPER_MIRROR_MAX; mirror++) {
+                u64 bytenr = btrfs_sb_offset(mirror);
+                u64 *logical;
+                int stripe_len;
+                int nr;
+                int ret;
+                ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
+                                       cache->key.objectid, bytenr, 0,
+                                       &logical, &nr, &stripe_len);
+                BUG_ON(ret);
+                /* walk the returned addresses from the last one down */
+                for (; nr; nr--)
+                        btrfs_remove_free_space(cache, logical[nr - 1],
+                                                stripe_len);
+                kfree(logical);
+        }
+        return 0;
+}
+/*
+ * Fill the free space cache of block_group by walking the extent tree:
+ * every gap between consecutive EXTENT_ITEMs inside the group is handed to
+ * add_new_free_space(), and afterwards remove_sb_from_cache() is run on
+ * the group.  A NULL or already cached group is a no-op.
+ *
+ * Returns 0 on success, -ENOMEM if no path could be allocated, or the
+ * negative error of the tree search / leaf walk.
+ */
+static int cache_block_group(struct btrfs_root *root,
+                             struct btrfs_block_group_cache *block_group)
+{
+        struct btrfs_path *path;
+        struct btrfs_key key;
+        struct extent_buffer *leaf;
+        u64 group_end;
+        u64 last;
+        int slot;
+        int ret = 0;
+        if (!block_group)
+                return 0;
+        root = root->fs_info->extent_root;
+        if (block_group->cached)
+                return 0;
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+        path->reada = 2;
+        /*
+         * callers may hold paths of their own and taking tree locks here
+         * deadlocks against them.  The alloc_mutex is protecting things
+         * right now, so the search runs without locking.
+         */
+        path->skip_locking = 1;
+        group_end = block_group->key.objectid + block_group->key.offset;
+        last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
+        key.objectid = last;
+        key.offset = 0;
+        btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
+        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+        if (ret < 0)
+                goto err;
+        while (1) {
+                leaf = path->nodes[0];
+                slot = path->slots[0];
+                if (slot >= btrfs_header_nritems(leaf)) {
+                        /* ran off this leaf, move on to the next one */
+                        ret = btrfs_next_leaf(root, path);
+                        if (ret < 0)
+                                goto err;
+                        if (ret > 0)
+                                break;
+                        continue;
+                }
+                btrfs_item_key_to_cpu(leaf, &key, slot);
+                /* items in front of the group are just stepped over */
+                if (key.objectid >= block_group->key.objectid) {
+                        if (key.objectid >= group_end)
+                                break;
+                        if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
+                                add_new_free_space(block_group, root->fs_info,
+                                                   last, key.objectid);
+                                last = key.objectid + key.offset;
+                        }
+                }
+                path->slots[0]++;
+        }
+        /* the tail of the group behind the last extent item */
+        add_new_free_space(block_group, root->fs_info, last, group_end);
+        remove_sb_from_cache(root, block_group);
+        block_group->cached = 1;
+        ret = 0;
+err:
+        btrfs_free_path(path);
+        return ret;
+}
+/*
+ * return the block group that starts at or after bytenr, or NULL.  The
+ * lookup increments the count of the group it returns.
+ */
+static struct btrfs_block_group_cache *
+btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
+{
+        return block_group_cache_tree_search(info, bytenr, 0);
+}
+/*
+ * return the block group that contains the given bytenr, or NULL.  The
+ * lookup increments the count of the group it returns.
+ */
+struct btrfs_block_group_cache *btrfs_lookup_block_group(
+                                                 struct btrfs_fs_info *info,
+                                                 u64 bytenr)
+{
+        return block_group_cache_tree_search(info, bytenr, 1);
+}
+/* drop one count on the block group and free it when that was the last */
+static inline void put_block_group(struct btrfs_block_group_cache *cache)
+{
+        if (!atomic_dec_and_test(&cache->count))
+                return;
+        kfree(cache);
+}
+/*
+ * walk info->space_info and return the entry whose flags are exactly
+ * equal to 'flags', or NULL if there is none
+ */
+static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
+                                                  u64 flags)
+{
+        struct btrfs_space_info *found;
+        list_for_each_entry(found, &info->space_info, list) {
+                if (found->flags == flags)
+                        return found;
+        }
+        return NULL;
+}
+/* scale num by factor/10; a factor of 10 returns num untouched */
+static u64 div_factor(u64 num, int factor)
+{
+        if (factor != 10) {
+                num *= factor;
+                do_div(num, 10);
+        }
+        return num;
+}
+/*
+ * Find a metadata block group that still has room and return its start,
+ * or 0 if none qualifies.  A group qualifies when used + pinned + reserved
+ * is below factor/10 of its size.
+ *
+ * Up to three passes are made: from max(search_hint, search_start), then
+ * again from search_start, and finally from search_start with the factor
+ * raised to 10 and read-only groups included.  'owner' is not referenced.
+ */
+u64 btrfs_find_block_group(struct btrfs_root *root,
+                           u64 search_start, u64 search_hint, int owner)
+{
+        struct btrfs_block_group_cache *cache;
+        u64 used;
+        u64 last = max(search_hint, search_start);
+        u64 group_start = 0;
+        int full_search = 0;
+        int factor = 9;
+        int wrapped = 0;
+        int fits;
+again:
+        while ((cache = btrfs_lookup_first_block_group(root->fs_info,
+                                                       last)) != NULL) {
+                fits = 0;
+                spin_lock(&cache->lock);
+                /* the next lookup continues right behind this group */
+                last = cache->key.objectid + cache->key.offset;
+                used = btrfs_block_group_used(&cache->item);
+                if ((full_search || !cache->ro) &&
+                    block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA) &&
+                    used + cache->pinned + cache->reserved <
+                    div_factor(cache->key.offset, factor)) {
+                        group_start = cache->key.objectid;
+                        fits = 1;
+                }
+                spin_unlock(&cache->lock);
+                put_block_group(cache);
+                if (fits)
+                        return group_start;
+                cond_resched();
+        }
+        if (!wrapped) {
+                last = search_start;
+                wrapped = 1;
+                goto again;
+        }
+        if (!full_search && factor < 10) {
+                last = search_start;
+                full_search = 1;
+                factor = 10;
+                goto again;
+        }
+        return group_start;
+}
+/*
+ * simple helper to search for an existing extent at a given offset
+ *
+ * Looks up the key (start, BTRFS_EXTENT_ITEM_KEY, len) in the extent tree
+ * and hands back the btrfs_search_slot() result unchanged.  Returns
+ * -ENOMEM if no path could be allocated; an allocation failure is
+ * recoverable for the caller and must not take the machine down.
+ */
+int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
+{
+        int ret;
+        struct btrfs_key key;
+        struct btrfs_path *path;
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+        key.objectid = start;
+        key.offset = len;
+        btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
+        ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
+                                0, 0);
+        btrfs_free_path(path);
+        return ret;
+}
+/*
+ * Back reference rules.  Back refs have three main goals:
+ *
+ * 1) differentiate between all holders of references to an extent so that
+ *    when a reference is dropped we can make sure it was a valid reference
+ *    before freeing the extent.
+ *
+ * 2) Provide enough information to quickly find the holders of an extent
+ *    if we notice a given block is corrupted or bad.
+ *
+ * 3) Make it easy to migrate blocks for FS shrinking or storage pool
+ *    maintenance.  This is actually the same as #2, but with a slightly
+ *    different use case.
+ *
+ * File extents can be referenced by:
+ *
+ * - multiple snapshots, subvolumes, or different generations in one subvol
+ * - different files inside a single subvolume
+ * - different offsets inside a file (bookend extents in file.c)
+ *
+ * The extent ref structure has fields for:
+ *
+ * - Objectid of the subvolume root
+ * - Generation number of the tree holding the reference
+ * - objectid of the file holding the reference
+ * - number of references held by the parent node (always 1 for tree blocks)
+ *
+ * Btree leaf may hold multiple references to a file extent. In most cases,
+ * these references are from same file and the corresponding offsets inside
+ * the file are close together.
+ *
+ * When a file extent is allocated the fields are filled in:
+ *     (root_key.objectid, trans->transid, inode objectid, 1)
+ *
+ * When a leaf is cow'd new references are added for every file extent found
+ * in the leaf.  It looks similar to the create case, but trans->transid will
+ * be different when the block is cow'd.
+ *
+ *     (root_key.objectid, trans->transid, inode objectid,
+ *      number of references in the leaf)
+ *
+ * When a file extent is removed either during snapshot deletion or
+ * file truncation, we find the corresponding back reference and check
+ * the following fields:
+ *
+ *     (btrfs_header_owner(leaf), btrfs_header_generation(leaf),
+ *      inode objectid)
+ *
+ * Btree extents can be referenced by:
+ *
+ * - Different subvolumes
+ * - Different generations of the same subvolume
+ *
+ * When a tree block is created, back references are inserted:
+ *
+ * (root->root_key.objectid, trans->transid, level, 1)
+ *
+ * When a tree block is cow'd, new back references are added for all the
+ * blocks it points to. If the tree block isn't in reference counted root,
+ * the old back references are removed. These new back references are of
+ * the form (trans->transid will have increased since creation):
+ *
+ * (root->root_key.objectid, trans->transid, level, 1)
+ *
+ * When a backref is being deleted, the following fields are checked:
+ *
+ * if backref was for a tree root:
+ *     (btrfs_header_owner(itself), btrfs_header_generation(itself), level)
+ * else
+ *     (btrfs_header_owner(parent), btrfs_header_generation(parent), level)
+ *
+ * Back Reference Key composing:
+ *
+ * The key objectid corresponds to the first byte in the extent, the key
+ * type is set to BTRFS_EXTENT_REF_KEY, and the key offset is the first
+ * byte of the parent extent. If an extent is a tree root, the key offset is set
+ * to the key objectid.
+ */
+/*
+ * Find the backref item keyed (bytenr, BTRFS_EXTENT_REF_KEY, parent) and
+ * check that its root, generation and owner are the expected ones.  An
+ * owner of BTRFS_MULTIPLE_OBJECTIDS in the item matches any owner.  When
+ * 'del' is set the search is issued with -1 instead of 0 as its size
+ * argument (presumably to prepare the path for a deletion).
+ *
+ * Returns 0 on a match, -ENOENT if there is no such key, -EIO (after a
+ * WARN_ON) if the item does not match, or the negative error of the
+ * search.  The path is not released here.
+ */
+static noinline int lookup_extent_backref(struct btrfs_trans_handle *trans,
+                                          struct btrfs_root *root,
+                                          struct btrfs_path *path,
+                                          u64 bytenr, u64 parent,
+                                          u64 ref_root, u64 ref_generation,
+                                          u64 owner_objectid, int del)
+{
+        struct btrfs_key key;
+        struct btrfs_extent_ref *ref;
+        struct extent_buffer *leaf;
+        u64 found_owner;
+        int ret;
+        key.objectid = bytenr;
+        key.type = BTRFS_EXTENT_REF_KEY;
+        key.offset = parent;
+        ret = btrfs_search_slot(trans, root, &key, path, del ? -1 : 0, 1);
+        if (ret < 0)
+                return ret;
+        if (ret > 0)
+                return -ENOENT;
+        leaf = path->nodes[0];
+        ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
+        found_owner = btrfs_ref_objectid(leaf, ref);
+        if (btrfs_ref_root(leaf, ref) != ref_root ||
+            btrfs_ref_generation(leaf, ref) != ref_generation ||
+            (found_owner != owner_objectid &&
+             found_owner != BTRFS_MULTIPLE_OBJECTIDS)) {
+                WARN_ON(1);
+                return -EIO;
+        }
+        return 0;
+}
+/*
+ * updates all the backrefs that are pending on update_list for the
+ * extent_root
+ *
+ * For every pending_extent_op on the list the existing backref keyed
+ * (bytenr, BTRFS_EXTENT_REF_KEY, orig_parent) is looked up, verified,
+ * re-keyed to (bytenr, BTRFS_EXTENT_REF_KEY, parent) and stamped with the
+ * new generation.  Each op is then taken off the list, its range is
+ * unlocked in extent_ins and the op is freed.
+ *
+ * The list must not be empty: the first entry is dereferenced
+ * unconditionally.  Always returns 0; lookup failures and mismatching
+ * backrefs BUG.
+ */
+static noinline int update_backrefs(struct btrfs_trans_handle *trans,
+                                    struct btrfs_root *extent_root,
+                                    struct btrfs_path *path,
+                                    struct list_head *update_list)
+{
+        struct btrfs_key key;
+        struct btrfs_extent_ref *ref;
+        struct btrfs_fs_info *info = extent_root->fs_info;
+        struct pending_extent_op *op;
+        struct extent_buffer *leaf;
+        int ret = 0;
+        struct list_head *cur = update_list->next;
+        u64 ref_objectid;
+        u64 ref_root = extent_root->root_key.objectid;
+        op = list_entry(cur, struct pending_extent_op, list);
+search:
+        /* full tree search for the backref of the current op */
+        key.objectid = op->bytenr;
+        key.type = BTRFS_EXTENT_REF_KEY;
+        key.offset = op->orig_parent;
+        ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 1);
+        /* both an error and "key not found" are fatal here */
+        BUG_ON(ret);
+        leaf = path->nodes[0];
+loop:
+        /* path->slots[0] points at the backref that belongs to 'op' */
+        ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
+        ref_objectid = btrfs_ref_objectid(leaf, ref);
+        if (btrfs_ref_root(leaf, ref) != ref_root ||
+            btrfs_ref_generation(leaf, ref) != op->orig_generation ||
+            (ref_objectid != op->level &&
+             ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
+                /* NOTE(review): op->level is an int but is printed with %u */
+                printk(KERN_ERR "btrfs couldn't find %llu, parent %llu, "
+                       "root %llu, owner %u\n",
+                       (unsigned long long)op->bytenr,
+                       (unsigned long long)op->orig_parent,
+                       (unsigned long long)ref_root, op->level);
+                btrfs_print_leaf(extent_root, leaf);
+                BUG();
+        }
+        /* move the item to its new key and record the new generation */
+        key.objectid = op->bytenr;
+        key.offset = op->parent;
+        key.type = BTRFS_EXTENT_REF_KEY;
+        ret = btrfs_set_item_key_safe(trans, extent_root, path, &key);
+        BUG_ON(ret);
+        ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
+        btrfs_set_ref_generation(leaf, ref, op->generation);
+        /* step to the next list entry before this op is unlinked and freed */
+        cur = cur->next;
+        list_del_init(&op->list);
+        unlock_extent(&info->extent_ins, op->bytenr,
+                      op->bytenr + op->num_bytes - 1, GFP_NOFS);
+        kfree(op);
+        if (cur == update_list) {
+                /* that was the last op on the list */
+                btrfs_mark_buffer_dirty(path->nodes[0]);
+                btrfs_release_path(extent_root, path);
+                goto out;
+        }
+        op = list_entry(cur, struct pending_extent_op, list);
+        /*
+         * look for a backref of the next op's extent further along the
+         * same leaf before falling back to a new tree search.  Only
+         * objectid and type are compared here; the root/generation/owner
+         * check at 'loop' catches a wrong item.
+         */
+        path->slots[0]++;
+        while (path->slots[0] < btrfs_header_nritems(leaf)) {
+                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+                if (key.objectid == op->bytenr &&
+                    key.type == BTRFS_EXTENT_REF_KEY)
+                        goto loop;
+                path->slots[0]++;
+        }
+        btrfs_mark_buffer_dirty(path->nodes[0]);
+        btrfs_release_path(extent_root, path);
+        goto search;
+out:
+        return 0;
+}
+/*
+ * Batch-insert the extent items and backrefs for every pending_extent_op
+ * on insert_list.  'nr' is the number of ops on the list; two keys are
+ * built per op, an EXTENT_ITEM followed by its EXTENT_REF, and handed to
+ * btrfs_insert_some_items() in as large runs as it accepts.
+ *
+ * Once both items of an op are in the tree, its range is unlocked in
+ * extent_ins and the op is taken off the list and freed.
+ *
+ * Returns 0, or -ENOMEM if the key/size arrays cannot be allocated.
+ * Insertion failures BUG.
+ */
+static noinline int insert_extents(struct btrfs_trans_handle *trans,
+                                   struct btrfs_root *extent_root,
+                                   struct btrfs_path *path,
+                                   struct list_head *insert_list, int nr)
+{
+        struct btrfs_key *keys;
+        u32 *data_size;
+        struct pending_extent_op *op;
+        struct extent_buffer *leaf;
+        struct list_head *cur = insert_list->next;
+        struct btrfs_fs_info *info = extent_root->fs_info;
+        u64 ref_root = extent_root->root_key.objectid;
+        int i = 0, last = 0, ret;
+        /* one extent item plus one backref per op */
+        int total = nr * 2;
+        if (!nr)
+                return 0;
+        keys = kzalloc(total * sizeof(struct btrfs_key), GFP_NOFS);
+        if (!keys)
+                return -ENOMEM;
+        data_size = kzalloc(total * sizeof(u32), GFP_NOFS);
+        if (!data_size) {
+                kfree(keys);
+                return -ENOMEM;
+        }
+        /* build the key and size arrays in list order */
+        list_for_each_entry(op, insert_list, list) {
+                keys[i].objectid = op->bytenr;
+                keys[i].offset = op->num_bytes;
+                keys[i].type = BTRFS_EXTENT_ITEM_KEY;
+                data_size[i] = sizeof(struct btrfs_extent_item);
+                i++;
+                keys[i].objectid = op->bytenr;
+                keys[i].offset = op->parent;
+                keys[i].type = BTRFS_EXTENT_REF_KEY;
+                data_size[i] = sizeof(struct btrfs_extent_ref);
+                i++;
+        }
+        op = list_entry(cur, struct pending_extent_op, list);
+        i = 0;
+        while (i < total) {
+                int c;
+                /* ret is the number of items that were actually inserted */
+                ret = btrfs_insert_some_items(trans, extent_root, path,
+                                              keys+i, data_size+i, total-i);
+                BUG_ON(ret < 0);
+                /* in 'last' mode only the single final key is outstanding */
+                if (last && ret > 1)
+                        BUG();
+                leaf = path->nodes[0];
+                for (c = 0; c < ret; c++) {
+                        int ref_first = keys[i].type == BTRFS_EXTENT_REF_KEY;
+                        /*
+                         * if the first item we inserted was a backref, then
+                         * the EXTENT_ITEM will be the odd c's, else it will
+                         * be the even c's
+                         */
+                        if ((ref_first && (c % 2)) ||
+                            (!ref_first && !(c % 2))) {
+                                struct btrfs_extent_item *itm;
+                                itm = btrfs_item_ptr(leaf, path->slots[0] + c,
+                                                     struct btrfs_extent_item);
+                                btrfs_set_extent_refs(path->nodes[0], itm, 1);
+                                op->del++;
+                        } else {
+                                struct btrfs_extent_ref *ref;
+                                ref = btrfs_item_ptr(leaf, path->slots[0] + c,
+                                                     struct btrfs_extent_ref);
+                                btrfs_set_ref_root(leaf, ref, ref_root);
+                                btrfs_set_ref_generation(leaf, ref,
+                                                         op->generation);
+                                btrfs_set_ref_objectid(leaf, ref, op->level);
+                                btrfs_set_ref_num_refs(leaf, ref, 1);
+                                op->del++;
+                        }
+                        /*
+                         * using del to see when it's ok to free up the
+                         * pending_extent_op.  In the case where we insert the
+                         * last item on the list in order to help do batching
+                         * we need to not free the extent op until we actually
+                         * insert the extent_item
+                         */
+                        if (op->del == 2) {
+                                unlock_extent(&info->extent_ins, op->bytenr,
+                                              op->bytenr + op->num_bytes - 1,
+                                              GFP_NOFS);
+                                /* advance before the op is unlinked and freed */
+                                cur = cur->next;
+                                list_del_init(&op->list);
+                                kfree(op);
+                                if (cur != insert_list)
+                                        op = list_entry(cur,
+                                                struct pending_extent_op,
+                                                list);
+                        }
+                }
+                btrfs_mark_buffer_dirty(leaf);
+                btrfs_release_path(extent_root, path);
+                /*
+                 * Ok backrefs and items usually go right next to each other,
+                 * but if we could only insert 1 item that means that we
+                 * inserted on the end of a leaf, and we have no idea what may
+                 * be on the next leaf so we just play it safe.  In order to
+                 * try and help this case we insert the last thing on our
+                 * insert list so hopefully it will end up being the last
+                 * thing on the leaf and everything else will be before it,
+                 * which will let us insert a whole bunch of items at the same
+                 * time.
+                 */
+                if (ret == 1 && !last && (i + ret < total)) {
+                        /*
+                         * last: where we will pick up the next time around
+                         * i: our current key to insert, will be total - 1
+                         * cur: the current op we are screwing with
+                         * op: duh
+                         */
+                        last = i + ret;
+                        i = total - 1;
+                        cur = insert_list->prev;
+                        op = list_entry(cur, struct pending_extent_op, list);
+                } else if (last) {
+                        /*
+                         * ok we successfully inserted the last item on the
+                         * list, lets reset everything
+                         *
+                         * i: our current key to insert, so where we left off
+                         *    last time
+                         * last: done with this
+                         * cur: the op we are messing with
+                         * op: duh
+                         * total: since we inserted the last key, we need to
+                         *        decrement total so we don't overflow
+                         */
+                        i = last;
+                        last = 0;
+                        total--;
+                        if (i < total) {
+                                cur = insert_list->next;
+                                op = list_entry(cur, struct pending_extent_op,
+                                                list);
+                        }
+                } else {
+                        /* normal case: continue behind what was just inserted */
+                        i += ret;
+                }
+                cond_resched();
+        }
+        ret = 0;
+        kfree(keys);
+        kfree(data_size);
+        return ret;
+}
+/*
+ * Add one reference to the backref keyed
+ * (bytenr, BTRFS_EXTENT_REF_KEY, parent).  A missing item is created with
+ * a count of 1.  An existing item has to carry the same root and
+ * generation and gets its count bumped; if its owner differs from
+ * owner_objectid, the owner is replaced by BTRFS_MULTIPLE_OBJECTIDS.
+ *
+ * Returns 0 on success, -EIO (after a WARN_ON) if an existing item does
+ * not match, or the error of the insertion.  The path is released before
+ * returning.
+ */
+static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
+                                          struct btrfs_root *root,
+                                          struct btrfs_path *path,
+                                          u64 bytenr, u64 parent,
+                                          u64 ref_root, u64 ref_generation,
+                                          u64 owner_objectid)
+{
+        struct btrfs_key key;
+        struct extent_buffer *leaf;
+        struct btrfs_extent_ref *ref;
+        u32 num_refs;
+        int ret;
+        key.objectid = bytenr;
+        key.type = BTRFS_EXTENT_REF_KEY;
+        key.offset = parent;
+        ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*ref));
+        switch (ret) {
+        case 0:
+                /* brand new backref, fill in every field */
+                leaf = path->nodes[0];
+                ref = btrfs_item_ptr(leaf, path->slots[0],
+                                     struct btrfs_extent_ref);
+                btrfs_set_ref_root(leaf, ref, ref_root);
+                btrfs_set_ref_generation(leaf, ref, ref_generation);
+                btrfs_set_ref_objectid(leaf, ref, owner_objectid);
+                btrfs_set_ref_num_refs(leaf, ref, 1);
+                break;
+        case -EEXIST: {
+                u64 existing_owner;
+                BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID);
+                leaf = path->nodes[0];
+                ref = btrfs_item_ptr(leaf, path->slots[0],
+                                     struct btrfs_extent_ref);
+                if (btrfs_ref_root(leaf, ref) != ref_root ||
+                    btrfs_ref_generation(leaf, ref) != ref_generation) {
+                        ret = -EIO;
+                        WARN_ON(1);
+                        goto out;
+                }
+                num_refs = btrfs_ref_num_refs(leaf, ref);
+                BUG_ON(num_refs == 0);
+                btrfs_set_ref_num_refs(leaf, ref, num_refs + 1);
+                /* more than one owner now shares this backref */
+                existing_owner = btrfs_ref_objectid(leaf, ref);
+                if (existing_owner != owner_objectid &&
+                    existing_owner != BTRFS_MULTIPLE_OBJECTIDS) {
+                        btrfs_set_ref_objectid(leaf, ref,
+                                        BTRFS_MULTIPLE_OBJECTIDS);
+                }
+                ret = 0;
+                break;
+        }
+        default:
+                goto out;
+        }
+        btrfs_mark_buffer_dirty(path->nodes[0]);
+out:
+        btrfs_release_path(root, path);
+        return ret;
+}
+/*
+ * Drop one reference from the backref item the path points at.  The item
+ * is deleted when that was its last reference, otherwise only the count
+ * is lowered.  The path is released in either case.
+ *
+ * Returns the result of btrfs_del_item() when the item was deleted, else 0.
+ */
+static noinline int remove_extent_backref(struct btrfs_trans_handle *trans,
+                                          struct btrfs_root *root,
+                                          struct btrfs_path *path)
+{
+        struct extent_buffer *leaf = path->nodes[0];
+        struct btrfs_extent_ref *ref;
+        u32 num_refs;
+        int ret = 0;
+        ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
+        num_refs = btrfs_ref_num_refs(leaf, ref);
+        BUG_ON(num_refs == 0);
+        if (num_refs == 1) {
+                ret = btrfs_del_item(trans, root, path);
+        } else {
+                btrfs_set_ref_num_refs(leaf, ref, num_refs - 1);
+                btrfs_mark_buffer_dirty(leaf);
+        }
+        btrfs_release_path(root, path);
+        return ret;
+}
+#ifdef BIO_RW_DISCARD
+/*
+ * Forward a discard request for @len bytes starting at byte offset @start
+ * to @bdev.  Both values are shifted right by 9 (divided by 512) before
+ * being passed to blkdev_issue_discard().
+ */
+static void btrfs_issue_discard(struct block_device *bdev,
+                                u64 start, u64 len)
+{
+        u64 first = start >> 9;
+        u64 count = len >> 9;
+        blkdev_issue_discard(bdev, first, count, GFP_KERNEL);
+}
+#endif
+/*
+ * Tell the underlying block device(s) that the logical range
+ * [bytenr, bytenr + num_bytes) can be discarded.
+ *
+ * The logical address is mapped with btrfs_map_block() and a discard is
+ * issued for every stripe the mapping returns, clamped to @num_bytes.
+ * Without BIO_RW_DISCARD this is a no-op.
+ *
+ * Returns 0 on success or the error from btrfs_map_block().
+ */
+static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
+                                u64 num_bytes)
+{
+#ifdef BIO_RW_DISCARD
+        struct btrfs_multi_bio *multi = NULL;
+        u64 map_length = num_bytes;
+        int idx;
+        int ret;
+        /* Tell the block device(s) that the sectors can be discarded */
+        ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
+                              bytenr, &map_length, &multi, 0);
+        if (ret)
+                return ret;
+        /* never discard more than the caller asked for */
+        if (map_length > num_bytes)
+                map_length = num_bytes;
+        for (idx = 0; idx < multi->num_stripes; idx++)
+                btrfs_issue_discard(multi->stripes[idx].dev->bdev,
+                                    multi->stripes[idx].physical,
+                                    map_length);
+        kfree(multi);
+        return ret;
+#else
+        return 0;
+#endif
+}
+/*
+ * Process every pending_extent_op queued on @del_list, dropping one
+ * reference per op from the extent tree rooted at @extent_root.
+ *
+ * For each op the backref is looked up, the extent item's ref count is
+ * decremented, and:
+ *  - if the count reaches zero, the extent item and its backref are deleted,
+ *    together with any directly following extent+backref pairs in the same
+ *    leaf that belong to the next ops on the list and have exactly one ref;
+ *    the super/root "bytes used" counters and block groups are updated;
+ *  - otherwise only the backref is dropped.
+ * Each finished op is unlinked, its range in info->extent_ins is unlocked
+ * and the op is freed.  The loop repeats until @del_list is empty.
+ *
+ * Returns -ENOMEM if no path could be allocated, the lookup error if a
+ * backref cannot be found, otherwise the last value stored in ret.
+ */
+static noinline int free_extents(struct btrfs_trans_handle *trans,
+                                 struct btrfs_root *extent_root,
+                                 struct list_head *del_list)
+{
+        struct btrfs_fs_info *info = extent_root->fs_info;
+        struct btrfs_path *path;
+        struct btrfs_key key, found_key;
+        struct extent_buffer *leaf;
+        struct list_head *cur;
+        struct pending_extent_op *op;
+        struct btrfs_extent_item *ei;
+        int ret, num_to_del, extent_slot = 0, found_extent = 0;
+        u32 refs;
+        u64 bytes_freed = 0;
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+        path->reada = 1;
+search:
+        /* search for the backref for the current ref we want to delete */
+        cur = del_list->next;
+        op = list_entry(cur, struct pending_extent_op, list);
+        ret = lookup_extent_backref(trans, extent_root, path, op->bytenr,
+                                    op->orig_parent,
+                                    extent_root->root_key.objectid,
+                                    op->orig_generation, op->level, 1);
+        if (ret) {
+                printk(KERN_ERR "btrfs unable to find backref byte nr %llu "
+                       "root %llu gen %llu owner %u\n",
+                       (unsigned long long)op->bytenr,
+                       (unsigned long long)extent_root->root_key.objectid,
+                       (unsigned long long)op->orig_generation, op->level);
+                btrfs_print_leaf(extent_root, path->nodes[0]);
+                WARN_ON(1);
+                goto out;
+        }
+        extent_slot = path->slots[0];
+        num_to_del = 1;
+        found_extent = 0;
+        /*
+         * if we aren't the first item on the leaf we can move back one and see
+         * if our ref is right next to our extent item
+         */
+        if (likely(extent_slot)) {
+                extent_slot--;
+                btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+                                      extent_slot);
+                if (found_key.objectid == op->bytenr &&
+                    found_key.type == BTRFS_EXTENT_ITEM_KEY &&
+                    found_key.offset == op->num_bytes) {
+                        num_to_del++;
+                        found_extent = 1;
+                }
+        }
+        /*
+         * if we didn't find the extent we need to delete the backref and then
+         * search for the extent item key so we can update its ref count
+         */
+        if (!found_extent) {
+                key.objectid = op->bytenr;
+                key.type = BTRFS_EXTENT_ITEM_KEY;
+                key.offset = op->num_bytes;
+                ret = remove_extent_backref(trans, extent_root, path);
+                BUG_ON(ret);
+                btrfs_release_path(extent_root, path);
+                ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1);
+                BUG_ON(ret);
+                extent_slot = path->slots[0];
+        }
+        /* this is where we update the ref count for the extent */
+        leaf = path->nodes[0];
+        ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item);
+        refs = btrfs_extent_refs(leaf, ei);
+        BUG_ON(refs == 0);
+        refs--;
+        btrfs_set_extent_refs(leaf, ei, refs);
+        btrfs_mark_buffer_dirty(leaf);
+        /*
+         * This extent needs deleting.  The reason cur_slot is extent_slot +
+         * num_to_del is because extent_slot points to the slot where the extent
+         * is, and if the backref was not right next to the extent we will be
+         * deleting at least 1 item, and will want to start searching at the
+         * slot directly next to extent_slot.  However if we did find the
+         * backref next to the extent item then we will be deleting at least 2
+         * items and will want to start searching directly after the ref slot
+         */
+        if (!refs) {
+                struct list_head *pos, *n, *end;
+                int cur_slot = extent_slot+num_to_del;
+                u64 super_used;
+                u64 root_used;
+                path->slots[0] = extent_slot;
+                bytes_freed = op->num_bytes;
+                /*
+                 * NOTE(review): the last argument is true when op->level is
+                 * >= BTRFS_FIRST_FREE_OBJECTID; presumably this flags a data
+                 * extent -- confirm against pin_down_bytes()
+                 */
+                mutex_lock(&info->pinned_mutex);
+                ret = pin_down_bytes(trans, extent_root, op->bytenr,
+                                     op->num_bytes, op->level >=
+                                     BTRFS_FIRST_FREE_OBJECTID);
+                mutex_unlock(&info->pinned_mutex);
+                BUG_ON(ret < 0);
+                op->del = ret;
+                /*
+                 * we need to see if we can delete multiple things at once, so
+                 * start looping through the list of extents we are wanting to
+                 * delete and see if their extents/backrefs are right next to
+                 * each other and the extents only have 1 ref
+                 */
+                for (pos = cur->next; pos != del_list; pos = pos->next) {
+                        struct pending_extent_op *tmp;
+                        tmp = list_entry(pos, struct pending_extent_op, list);
+                        /* we only want to delete extent+ref at this stage */
+                        if (cur_slot >= btrfs_header_nritems(leaf) - 1)
+                                break;
+                        btrfs_item_key_to_cpu(leaf, &found_key, cur_slot);
+                        if (found_key.objectid != tmp->bytenr ||
+                            found_key.type != BTRFS_EXTENT_ITEM_KEY ||
+                            found_key.offset != tmp->num_bytes)
+                                break;
+                        /* check to make sure this extent only has one ref */
+                        ei = btrfs_item_ptr(leaf, cur_slot,
+                                            struct btrfs_extent_item);
+                        if (btrfs_extent_refs(leaf, ei) != 1)
+                                break;
+                        btrfs_item_key_to_cpu(leaf, &found_key, cur_slot+1);
+                        if (found_key.objectid != tmp->bytenr ||
+                            found_key.type != BTRFS_EXTENT_REF_KEY ||
+                            found_key.offset != tmp->orig_parent)
+                                break;
+                        /*
+                         * the ref is right next to the extent, we can set the
+                         * ref count to 0 since we will delete them both now
+                         */
+                        btrfs_set_extent_refs(leaf, ei, 0);
+                        /* pin down the bytes for this extent */
+                        mutex_lock(&info->pinned_mutex);
+                        ret = pin_down_bytes(trans, extent_root, tmp->bytenr,
+                                             tmp->num_bytes, tmp->level >=
+                                             BTRFS_FIRST_FREE_OBJECTID);
+                        mutex_unlock(&info->pinned_mutex);
+                        BUG_ON(ret < 0);
+                        /*
+                         * use the del field to tell if we need to go ahead and
+                         * free up the extent when we delete the item or not.
+                         */
+                        tmp->del = ret;
+                        bytes_freed += tmp->num_bytes;
+                        num_to_del += 2;
+                        cur_slot += 2;
+                }
+                /* first list entry that was NOT batched into this delete */
+                end = pos;
+                /* update the free space counters */
+                spin_lock(&info->delalloc_lock);
+                super_used = btrfs_super_bytes_used(&info->super_copy);
+                btrfs_set_super_bytes_used(&info->super_copy,
+                                           super_used - bytes_freed);
+                root_used = btrfs_root_used(&extent_root->root_item);
+                btrfs_set_root_used(&extent_root->root_item,
+                                    root_used - bytes_freed);
+                spin_unlock(&info->delalloc_lock);
+                /* delete the items */
+                ret = btrfs_del_items(trans, extent_root, path,
+                                      path->slots[0], num_to_del);
+                BUG_ON(ret);
+                /*
+                 * loop through the extents we deleted and do the cleanup work
+                 * on them; n is fetched up front because tmp is freed inside
+                 * the loop body
+                 */
+                for (pos = cur, n = pos->next; pos != end;
+                     pos = n, n = pos->next) {
+                        struct pending_extent_op *tmp;
+                        tmp = list_entry(pos, struct pending_extent_op, list);
+                        /*
+                         * remember tmp->del tells us whether or not we pinned
+                         * down the extent
+                         */
+                        ret = update_block_group(trans, extent_root,
+                                                 tmp->bytenr, tmp->num_bytes, 0,
+                                                 tmp->del);
+                        BUG_ON(ret);
+                        list_del_init(&tmp->list);
+                        unlock_extent(&info->extent_ins, tmp->bytenr,
+                                      tmp->bytenr + tmp->num_bytes - 1,
+                                      GFP_NOFS);
+                        kfree(tmp);
+                }
+        } else if (refs && found_extent) {
+                /*
+                 * the ref and extent were right next to each other, but the
+                 * extent still has a ref, so just free the backref and keep
+                 * going
+                 */
+                ret = remove_extent_backref(trans, extent_root, path);
+                BUG_ON(ret);
+                list_del_init(&op->list);
+                unlock_extent(&info->extent_ins, op->bytenr,
+                              op->bytenr + op->num_bytes - 1, GFP_NOFS);
+                kfree(op);
+        } else {
+                /*
+                 * the extent has multiple refs and the backref we were looking
+                 * for was not right next to it (it was already removed above),
+                 * so just unlock and go next, we're good to go
+                 */
+                list_del_init(&op->list);
+                unlock_extent(&info->extent_ins, op->bytenr,
+                              op->bytenr + op->num_bytes - 1, GFP_NOFS);
+                kfree(op);
+        }
+        btrfs_release_path(extent_root, path);
+        if (!list_empty(del_list))
+                goto search;
+out:
+        btrfs_free_path(path);
+        return ret;
+}
+/*
+ * Re-point one extent backref for @bytenr from
+ * (orig_parent, orig_root, orig_generation) to
+ * (parent, ref_root, ref_generation).
+ *
+ * When @root is the extent root itself nothing is written to the tree here:
+ * the change is recorded in a pending_extent_op attached to the extent_ins
+ * tree (under extent_ins_mutex) and 0 is returned.  For every other root the
+ * old backref is looked up and dropped, the new one is inserted, and pending
+ * inserts/deletes are run.
+ *
+ * Returns 0 on success, -ENOMEM if no path could be allocated, or the error
+ * from looking up / removing the old backref.
+ */
+static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
+                                     struct btrfs_root *root, u64 bytenr,
+                                     u64 orig_parent, u64 parent,
+                                     u64 orig_root, u64 ref_root,
+                                     u64 orig_generation, u64 ref_generation,
+                                     u64 owner_objectid)
+{
+        struct btrfs_fs_info *fs_info = root->fs_info;
+        struct btrfs_root *extent_root = fs_info->extent_root;
+        struct pending_extent_op *extent_op;
+        struct btrfs_path *path;
+        u64 num_bytes;
+        u64 priv;
+        int ret;
+        if (root != extent_root) {
+                /* ordinary root: swap the backref in the extent tree now */
+                path = btrfs_alloc_path();
+                if (!path)
+                        return -ENOMEM;
+                ret = lookup_extent_backref(trans, extent_root, path,
+                                            bytenr, orig_parent, orig_root,
+                                            orig_generation, owner_objectid,
+                                            1);
+                if (!ret)
+                        ret = remove_extent_backref(trans, extent_root, path);
+                if (!ret) {
+                        ret = insert_extent_backref(trans, extent_root, path,
+                                                    bytenr, parent, ref_root,
+                                                    ref_generation,
+                                                    owner_objectid);
+                        BUG_ON(ret);
+                        finish_current_insert(trans, extent_root, 0);
+                        del_pending_extents(trans, extent_root, 0);
+                }
+                btrfs_free_path(path);
+                return ret;
+        }
+        /* extent root: owner_objectid is a tree level, queue the update */
+        BUG_ON(owner_objectid >= BTRFS_MAX_LEVEL);
+        num_bytes = btrfs_level_size(root, (int)owner_objectid);
+        mutex_lock(&fs_info->extent_ins_mutex);
+        if (!test_range_bit(&fs_info->extent_ins, bytenr,
+                            bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
+                /* nothing queued for this block yet, record a new op */
+                extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
+                BUG_ON(!extent_op);
+                INIT_LIST_HEAD(&extent_op->list);
+                extent_op->type = PENDING_BACKREF_UPDATE;
+                extent_op->del = 0;
+                extent_op->level = (int)owner_objectid;
+                extent_op->bytenr = bytenr;
+                extent_op->num_bytes = num_bytes;
+                extent_op->orig_parent = orig_parent;
+                extent_op->parent = parent;
+                extent_op->orig_generation = orig_generation;
+                extent_op->generation = ref_generation;
+                set_extent_bits(&fs_info->extent_ins,
+                                bytenr, bytenr + num_bytes - 1,
+                                EXTENT_WRITEBACK, GFP_NOFS);
+                set_state_private(&fs_info->extent_ins,
+                                  bytenr, (unsigned long)extent_op);
+        } else {
+                /* an op is already queued: retarget it in place */
+                ret = get_state_private(&fs_info->extent_ins, bytenr, &priv);
+                BUG_ON(ret);
+                extent_op = (struct pending_extent_op *)(unsigned long)priv;
+                BUG_ON(extent_op->parent != orig_parent);
+                BUG_ON(extent_op->generation != orig_generation);
+                extent_op->parent = parent;
+                extent_op->generation = ref_generation;
+        }
+        mutex_unlock(&fs_info->extent_ins_mutex);
+        return 0;
+}
+/*
+ * Move the backref of @bytenr from @orig_parent to @parent while keeping the
+ * same root and generation (@ref_root / @ref_generation are passed as both
+ * the original and the new values).
+ *
+ * Nothing is done, and 0 is returned, for references held by the tree log
+ * root whose owner is below BTRFS_FIRST_FREE_OBJECTID.
+ */
+int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
+                            struct btrfs_root *root, u64 bytenr,
+                            u64 orig_parent, u64 parent,
+                            u64 ref_root, u64 ref_generation,
+                            u64 owner_objectid)
+{
+        if (ref_root != BTRFS_TREE_LOG_OBJECTID ||
+            owner_objectid >= BTRFS_FIRST_FREE_OBJECTID)
+                return __btrfs_update_extent_ref(trans, root, bytenr,
+                                                 orig_parent, parent,
+                                                 ref_root, ref_root,
+                                                 ref_generation,
+                                                 ref_generation,
+                                                 owner_objectid);
+        return 0;
+}
+/*
+ * Add one reference to the extent item that starts at @bytenr and insert the
+ * matching backref (@parent, @ref_root, @ref_generation, @owner_objectid).
+ *
+ * @orig_parent, @orig_root and @orig_generation are not used; they are part
+ * of the signature so that this function and __btrfs_update_extent_ref() can
+ * be called through the same function pointer (see btrfs_inc_ref()).
+ *
+ * Returns 0 on success, -ENOMEM if no path could be allocated, or the
+ * negative error from btrfs_search_slot().
+ */
+static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
+                                  struct btrfs_root *root, u64 bytenr,
+                                  u64 orig_parent, u64 parent,
+                                  u64 orig_root, u64 ref_root,
+                                  u64 orig_generation, u64 ref_generation,
+                                  u64 owner_objectid)
+{
+        struct btrfs_path *path;
+        int ret;
+        struct btrfs_key key;
+        struct extent_buffer *l;
+        struct btrfs_extent_item *item;
+        u32 refs;
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+        path->reada = 1;
+        /*
+         * Search with the largest possible offset: the key cannot match
+         * exactly, so the slot found is one past the last item for bytenr
+         * and stepping back one slot lands on the extent item.
+         */
+        key.objectid = bytenr;
+        key.type = BTRFS_EXTENT_ITEM_KEY;
+        key.offset = (u64)-1;
+        ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
+                                0, 1);
+        if (ret < 0) {
+                /* the path was allocated above, don't leak it on error */
+                btrfs_free_path(path);
+                return ret;
+        }
+        BUG_ON(ret == 0 || path->slots[0] == 0);
+        path->slots[0]--;
+        l = path->nodes[0];
+        btrfs_item_key_to_cpu(l, &key, path->slots[0]);
+        if (key.objectid != bytenr) {
+                btrfs_print_leaf(root->fs_info->extent_root, path->nodes[0]);
+                printk(KERN_ERR "btrfs wanted %llu found %llu\n",
+                       (unsigned long long)bytenr,
+                       (unsigned long long)key.objectid);
+                BUG();
+        }
+        BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY);
+        /* bump the reference count stored in the extent item */
+        item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
+        refs = btrfs_extent_refs(l, item);
+        btrfs_set_extent_refs(l, item, refs + 1);
+        btrfs_mark_buffer_dirty(path->nodes[0]);
+        btrfs_release_path(root->fs_info->extent_root, path);
+        /* reuse the released path to insert the new backref */
+        path->reada = 1;
+        ret = insert_extent_backref(trans, root->fs_info->extent_root,
+                                    path, bytenr, parent,
+                                    ref_root, ref_generation,
+                                    owner_objectid);
+        BUG_ON(ret);
+        finish_current_insert(trans, root->fs_info->extent_root, 0);
+        del_pending_extents(trans, root->fs_info->extent_root, 0);
+        btrfs_free_path(path);
+        return 0;
+}
+/*
+ * Take a new reference on the extent at @bytenr on behalf of
+ * (@parent, @ref_root, @ref_generation, @owner_objectid).  The "orig"
+ * arguments of __btrfs_inc_extent_ref() are passed as 0; @num_bytes is not
+ * used.
+ *
+ * Nothing is done, and 0 is returned, for references held by the tree log
+ * root whose owner is below BTRFS_FIRST_FREE_OBJECTID.
+ */
+int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
+                         struct btrfs_root *root,
+                         u64 bytenr, u64 num_bytes, u64 parent,
+                         u64 ref_root, u64 ref_generation,
+                         u64 owner_objectid)
+{
+        if (ref_root != BTRFS_TREE_LOG_OBJECTID ||
+            owner_objectid >= BTRFS_FIRST_FREE_OBJECTID)
+                return __btrfs_inc_extent_ref(trans, root, bytenr, 0, parent,
+                                              0, ref_root, 0, ref_generation,
+                                              owner_objectid);
+        return 0;
+}
+/*
+ * Run the pending extent inserts and then the pending extent deletes on the
+ * extent root of @root's filesystem, passing 1 as the final argument to
+ * both helpers.  Always returns 0.
+ */
+int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
+                         struct btrfs_root *root)
+{
+        struct btrfs_root *extent_root = root->fs_info->extent_root;
+        finish_current_insert(trans, extent_root, 1);
+        del_pending_extents(trans, extent_root, 1);
+        return 0;
+}
+/*
+ * Read the reference count of the extent item keyed by
+ * (@bytenr, BTRFS_EXTENT_ITEM_KEY, @num_bytes) into *@refs.
+ *
+ * A missing extent item is treated as fatal (the leaf is printed and BUG()
+ * is hit).
+ *
+ * Returns 0 on success with *@refs filled in, -ENOMEM if no path could be
+ * allocated, or the negative error from btrfs_search_slot(); *@refs is left
+ * untouched on error.
+ */
+int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
+                            struct btrfs_root *root, u64 bytenr,
+                            u64 num_bytes, u32 *refs)
+{
+        struct btrfs_path *path;
+        int ret;
+        struct btrfs_key key;
+        struct extent_buffer *l;
+        struct btrfs_extent_item *item;
+        WARN_ON(num_bytes < root->sectorsize);
+        path = btrfs_alloc_path();
+        /* the path is dereferenced right below, so it must not be NULL */
+        if (!path)
+                return -ENOMEM;
+        path->reada = 1;
+        key.objectid = bytenr;
+        key.offset = num_bytes;
+        btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
+        ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
+                                0, 0);
+        if (ret < 0)
+                goto out;
+        if (ret != 0) {
+                btrfs_print_leaf(root, path->nodes[0]);
+                printk(KERN_INFO "btrfs failed to find block number %llu\n",
+                       (unsigned long long)bytenr);
+                BUG();
+        }
+        l = path->nodes[0];
+        item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
+        *refs = btrfs_extent_refs(l, item);
+out:
+        btrfs_free_path(path);
+        /*
+         * ret is 0 on the success path; on a search error report it instead
+         * of claiming success with *refs never written
+         */
+        return ret;
+}
+/*
+ * Check whether the extent at @bytenr is referenced by anything other than
+ * inode @objectid of @root.
+ *
+ * Every BTRFS_EXTENT_REF_KEY item for @bytenr is examined.  A cross
+ * reference is reported when a backref
+ *  - belongs to a root other than @root or the tree log root, or
+ *  - records an objectid different from @objectid, or
+ *  - has a generation <= the last snapshot of @root.
+ *
+ * Returns 1 if such a reference exists, 0 if none was found, -ENOENT if no
+ * extent item for @bytenr was found, -ENOMEM if no path could be allocated,
+ * or another negative error from the tree search.
+ */
+int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
+                          struct btrfs_root *root, u64 objectid, u64 bytenr)
+{
+        struct btrfs_root *extent_root = root->fs_info->extent_root;
+        struct btrfs_path *path;
+        struct extent_buffer *leaf;
+        struct btrfs_extent_ref *ref_item;
+        struct btrfs_key key;
+        struct btrfs_key found_key;
+        u64 ref_root;
+        u64 last_snapshot;
+        u32 nritems;
+        int ret;
+        /*
+         * the largest offset cannot match exactly; the search lands just
+         * past the extent item for bytenr
+         */
+        key.objectid = bytenr;
+        key.offset = (u64)-1;
+        key.type = BTRFS_EXTENT_ITEM_KEY;
+        path = btrfs_alloc_path();
+        /* the path is dereferenced below, so bail out on allocation failure */
+        if (!path)
+                return -ENOMEM;
+        ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
+        if (ret < 0)
+                goto out;
+        BUG_ON(ret == 0);
+        ret = -ENOENT;
+        if (path->slots[0] == 0)
+                goto out;
+        /* step back onto what should be the extent item itself */
+        path->slots[0]--;
+        leaf = path->nodes[0];
+        btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+        if (found_key.objectid != bytenr ||
+            found_key.type != BTRFS_EXTENT_ITEM_KEY)
+                goto out;
+        last_snapshot = btrfs_root_last_snapshot(&root->root_item);
+        /* walk forward over every item that still belongs to bytenr */
+        while (1) {
+                leaf = path->nodes[0];
+                nritems = btrfs_header_nritems(leaf);
+                if (path->slots[0] >= nritems) {
+                        ret = btrfs_next_leaf(extent_root, path);
+                        if (ret < 0)
+                                goto out;
+                        if (ret == 0)
+                                continue;
+                        break;
+                }
+                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+                if (found_key.objectid != bytenr)
+                        break;
+                if (found_key.type != BTRFS_EXTENT_REF_KEY) {
+                        path->slots[0]++;
+                        continue;
+                }
+                ref_item = btrfs_item_ptr(leaf, path->slots[0],
+                                          struct btrfs_extent_ref);
+                ref_root = btrfs_ref_root(leaf, ref_item);
+                if ((ref_root != root->root_key.objectid &&
+                     ref_root != BTRFS_TREE_LOG_OBJECTID) ||
+                     objectid != btrfs_ref_objectid(leaf, ref_item)) {
+                        ret = 1;
+                        goto out;
+                }
+                if (btrfs_ref_generation(leaf, ref_item) <= last_snapshot) {
+                        ret = 1;
+                        goto out;
+                }
+                path->slots[0]++;
+        }
+        ret = 0;
+out:
+        btrfs_free_path(path);
+        return ret;
+}
+/*
+ * Build a btrfs_leaf_ref describing the file extents referenced by leaf
+ * @buf and add it to @root's leaf ref cache.
+ *
+ * Only roots with ref_cows set and only level-0 buffers are handled; every
+ * other call returns 0 without doing anything.  Inline extents and extents
+ * whose disk bytenr is 0 are not recorded.  @nr_extents is the number of
+ * slots requested from btrfs_alloc_leaf_ref(); the caller is expected to
+ * pass the number of recordable extents in @buf.
+ *
+ * For the relocation tree the ref is added as "shared" and replaces an
+ * existing entry for the same bytenr.
+ *
+ * Returns 0 on success, -ENOMEM if the leaf ref cannot be allocated, or the
+ * error from btrfs_add_leaf_ref().
+ */
+int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+                    struct extent_buffer *buf, u32 nr_extents)
+{
+        struct btrfs_leaf_ref *ref;
+        struct btrfs_leaf_ref *stale;
+        struct btrfs_extent_info *slot_info;
+        struct btrfs_file_extent_item *fi;
+        struct btrfs_key key;
+        u64 root_gen;
+        u64 disk_bytenr;
+        u32 nritems;
+        int shared;
+        int ret;
+        int i;
+        if (!root->ref_cows)
+                return 0;
+        if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
+                shared = 1;
+                root_gen = trans->transid - 1;
+        } else {
+                shared = 0;
+                root_gen = root->root_key.offset;
+        }
+        /* only leaves carry file extent items */
+        if (btrfs_header_level(buf) != 0)
+                return 0;
+        nritems = btrfs_header_nritems(buf);
+        ref = btrfs_alloc_leaf_ref(root, nr_extents);
+        if (!ref)
+                return -ENOMEM;
+        ref->bytenr = buf->start;
+        ref->owner = btrfs_header_owner(buf);
+        ref->generation = btrfs_header_generation(buf);
+        ref->root_gen = root_gen;
+        ref->nritems = nr_extents;
+        slot_info = ref->extents;
+        for (i = 0; nr_extents > 0 && i < nritems; i++) {
+                btrfs_item_key_to_cpu(buf, &key, i);
+                if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
+                        continue;
+                fi = btrfs_item_ptr(buf, i, struct btrfs_file_extent_item);
+                if (btrfs_file_extent_type(buf, fi) !=
+                    BTRFS_FILE_EXTENT_INLINE) {
+                        disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
+                        if (disk_bytenr != 0) {
+                                slot_info->bytenr = disk_bytenr;
+                                slot_info->num_bytes =
+                                    btrfs_file_extent_disk_num_bytes(buf, fi);
+                                slot_info->objectid = key.objectid;
+                                slot_info->offset = key.offset;
+                                slot_info++;
+                        }
+                }
+        }
+        ret = btrfs_add_leaf_ref(root, ref, shared);
+        if (shared && ret == -EEXIST) {
+                /* replace the cached entry for this block with the new one */
+                stale = btrfs_lookup_leaf_ref(root, ref->bytenr);
+                BUG_ON(!stale);
+                btrfs_remove_leaf_ref(root, stale);
+                btrfs_free_leaf_ref(root, stale);
+                ret = btrfs_add_leaf_ref(root, ref, shared);
+        }
+        WARN_ON(ret);
+        btrfs_free_leaf_ref(root, ref);
+        return ret;
+}
+/*
+ * Walk every pointer in @buf and account a reference for the block or file
+ * extent it points to, with @orig_buf supplying the "original" parent, root
+ * and generation.
+ *
+ * For roots with ref_cows set a new reference is added
+ * (__btrfs_inc_extent_ref); otherwise the existing backref is re-pointed
+ * (__btrfs_update_extent_ref), and then only for nodes of ordinary roots or
+ * leaves of the tree log root.
+ *
+ * If @nr_extents is not NULL it receives the number of file extents seen
+ * (for a leaf) or the number of items in @buf (for a node).
+ *
+ * Returns 0 on success or the first error from the per-extent helper.
+ */
+int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+                  struct extent_buffer *orig_buf, struct extent_buffer *buf,
+                  u32 *nr_extents)
+{
+        int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
+                            u64, u64, u64, u64, u64, u64, u64, u64);
+        struct btrfs_file_extent_item *fi;
+        struct btrfs_key key;
+        u64 ref_root = btrfs_header_owner(buf);
+        u64 ref_generation = btrfs_header_generation(buf);
+        u64 orig_root = btrfs_header_owner(orig_buf);
+        u64 orig_generation = btrfs_header_generation(orig_buf);
+        u64 bytenr;
+        u64 owner;
+        u32 nritems = btrfs_header_nritems(buf);
+        u32 nr_file_extents = 0;
+        int level = btrfs_header_level(buf);
+        int is_log_root;
+        int ret = 0;
+        int i;
+        if (root->ref_cows) {
+                process_func = __btrfs_inc_extent_ref;
+        } else {
+                /*
+                 * without ref_cows only leaves of the log root and nodes of
+                 * every other root are processed
+                 */
+                is_log_root = (root->root_key.objectid ==
+                               BTRFS_TREE_LOG_OBJECTID);
+                if ((level == 0) != is_log_root)
+                        goto out;
+                process_func = __btrfs_update_extent_ref;
+        }
+        for (i = 0; i < nritems; i++) {
+                cond_resched();
+                if (level != 0) {
+                        /* node: the owner passed down is the child's level */
+                        bytenr = btrfs_node_blockptr(buf, i);
+                        owner = level - 1;
+                } else {
+                        /* leaf: only real (non-inline, non-hole) extents */
+                        btrfs_item_key_to_cpu(buf, &key, i);
+                        if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
+                                continue;
+                        fi = btrfs_item_ptr(buf, i,
+                                            struct btrfs_file_extent_item);
+                        if (btrfs_file_extent_type(buf, fi) ==
+                            BTRFS_FILE_EXTENT_INLINE)
+                                continue;
+                        bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
+                        if (bytenr == 0)
+                                continue;
+                        nr_file_extents++;
+                        owner = key.objectid;
+                }
+                ret = process_func(trans, root, bytenr,
+                                   orig_buf->start, buf->start,
+                                   orig_root, ref_root,
+                                   orig_generation, ref_generation,
+                                   owner);
+                if (ret) {
+                        WARN_ON(1);
+                        goto fail;
+                }
+        }
+out:
+        if (nr_extents) {
+                if (level == 0)
+                        *nr_extents = nr_file_extents;
+                else
+                        *nr_extents = nritems;
+        }
+        return 0;
+fail:
+        WARN_ON(1);
+        return ret;
+}
+/*
+ * Re-point the backrefs of @nr pointers in @buf, starting at @start_slot,
+ * from @orig_buf (parent, owner, generation) to @buf.
+ *
+ * For roots without ref_cows only nodes of ordinary roots and leaves of the
+ * tree log root are processed; other calls return 0 immediately.  In leaves
+ * only non-inline file extents with a non-zero disk bytenr are updated.
+ *
+ * Returns 0 on success or the negative error reported by
+ * __btrfs_update_extent_ref() for the first pointer that failed.
+ */
+int btrfs_update_ref(struct btrfs_trans_handle *trans,
+                     struct btrfs_root *root, struct extent_buffer *orig_buf,
+                     struct extent_buffer *buf, int start_slot, int nr)
+{
+        u64 bytenr;
+        u64 ref_root;
+        u64 orig_root;
+        u64 ref_generation;
+        u64 orig_generation;
+        struct btrfs_key key;
+        struct btrfs_file_extent_item *fi;
+        int i;
+        int ret;
+        int slot;
+        int level;
+        BUG_ON(start_slot < 0);
+        BUG_ON(start_slot + nr > btrfs_header_nritems(buf));
+        ref_root = btrfs_header_owner(buf);
+        ref_generation = btrfs_header_generation(buf);
+        orig_root = btrfs_header_owner(orig_buf);
+        orig_generation = btrfs_header_generation(orig_buf);
+        level = btrfs_header_level(buf);
+        if (!root->ref_cows) {
+                if (level == 0 &&
+                    root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
+                        return 0;
+                if (level != 0 &&
+                    root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID)
+                        return 0;
+        }
+        for (i = 0, slot = start_slot; i < nr; i++, slot++) {
+                cond_resched();
+                if (level == 0) {
+                        /* leaf: skip everything that is not a real extent */
+                        btrfs_item_key_to_cpu(buf, &key, slot);
+                        if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
+                                continue;
+                        fi = btrfs_item_ptr(buf, slot,
+                                            struct btrfs_file_extent_item);
+                        if (btrfs_file_extent_type(buf, fi) ==
+                            BTRFS_FILE_EXTENT_INLINE)
+                                continue;
+                        bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
+                        if (bytenr == 0)
+                                continue;
+                        ret = __btrfs_update_extent_ref(trans, root, bytenr,
+                                            orig_buf->start, buf->start,
+                                            orig_root, ref_root,
+                                            orig_generation, ref_generation,
+                                            key.objectid);
+                        if (ret)
+                                goto fail;
+                } else {
+                        /* node: the owner passed down is the child's level */
+                        bytenr = btrfs_node_blockptr(buf, slot);
+                        ret = __btrfs_update_extent_ref(trans, root, bytenr,
+                                            orig_buf->start, buf->start,
+                                            orig_root, ref_root,
+                                            orig_generation, ref_generation,
+                                            level - 1);
+                        if (ret)
+                                goto fail;
+                }
+        }
+        return 0;
+fail:
+        WARN_ON(1);
+        /*
+         * hand back the real error (as btrfs_inc_ref() does) rather than a
+         * bare -1, which callers would read as -EPERM
+         */
+        return ret;
+}
+/*
+ * Copy the in-memory block group item of @cache over its item in the
+ * extent tree, then run finish_current_insert() and
+ * del_pending_extents() on the extent root.  The pending extent work is
+ * run even when the tree search fails.
+ *
+ * Returns the search error if there was one, otherwise the result of
+ * del_pending_extents().
+ */
+static int write_one_cache_group(struct btrfs_trans_handle *trans,
+                                 struct btrfs_root *root,
+                                 struct btrfs_path *path,
+                                 struct btrfs_block_group_cache *cache)
+{
+        struct btrfs_root *extent_root = root->fs_info->extent_root;
+        struct extent_buffer *eb;
+        unsigned long item_off;
+        int pending_err;
+        int err;
+        err = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
+        if (err >= 0) {
+                /* a positive result means the item was not found */
+                BUG_ON(err);
+                eb = path->nodes[0];
+                item_off = btrfs_item_ptr_offset(eb, path->slots[0]);
+                write_extent_buffer(eb, &cache->item, item_off,
+                                    sizeof(cache->item));
+                btrfs_mark_buffer_dirty(eb);
+                btrfs_release_path(extent_root, path);
+        }
+        finish_current_insert(trans, extent_root, 0);
+        pending_err = del_pending_extents(trans, extent_root, 0);
+        return err ? err : pending_err;
+}
+/*
+ * Write every dirty block group item back to the extent tree.
+ *
+ * Each pass scans the block group cache tree under
+ * block_group_cache_lock for the first entry with ->dirty set, clears
+ * the flag and writes the entry with write_one_cache_group().  The loop
+ * ends once a scan finds no dirty entry.
+ *
+ * Returns -ENOMEM if no path could be allocated, otherwise 0 or the
+ * error of the last write that failed.
+ */
+int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
+                                   struct btrfs_root *root)
+{
+        struct btrfs_block_group_cache *cache, *entry;
+        struct rb_node *n;
+        int err = 0;
+        int werr = 0;
+        struct btrfs_path *path;
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+        while (1) {
+                cache = NULL;
+                spin_lock(&root->fs_info->block_group_cache_lock);
+                for (n = rb_first(&root->fs_info->block_group_cache_tree);
+                     n; n = rb_next(n)) {
+                        entry = rb_entry(n, struct btrfs_block_group_cache,
+                                         cache_node);
+                        if (entry->dirty) {
+                                cache = entry;
+                                break;
+                        }
+                }
+                spin_unlock(&root->fs_info->block_group_cache_lock);
+                if (!cache)
+                        break;
+                cache->dirty = 0;
+                err = write_one_cache_group(trans, root,
+                                            path, cache);
+                /*
+                 * The dirty flag was cleared above and is not restored
+                 * here, so a group whose write failed is not picked up
+                 * again by this loop (restoring it would make the scan
+                 * find the same entry forever).  The error is remembered
+                 * and the remaining dirty groups are still written.
+                 */
+                if (err)
+                        werr = err;
+        }
+        btrfs_free_path(path);
+        return werr;
+}
+/*
+ * Return 1 if no block group is found for @bytenr or the block group
+ * found is marked read-only, 0 otherwise.
+ */
+int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
+{
+        struct btrfs_block_group_cache *bg;
+        int ro;
+        bg = btrfs_lookup_block_group(root->fs_info, bytenr);
+        if (!bg)
+                return 1;
+        ro = bg->ro ? 1 : 0;
+        /* release the block group handed back by the lookup */
+        put_block_group(bg);
+        return ro;
+}
+/*
+ * Account @total_bytes and @bytes_used to the space info matching
+ * @flags, creating that space info if it does not exist yet.
+ *
+ * For an existing space info the counters are bumped under its lock and
+ * its ->full flag is cleared.  A new one is allocated with GFP_NOFS and
+ * is only added to info->space_info after all of its fields (list head,
+ * rwsem, spinlock, counters) have been initialised, so nothing walking
+ * the list can see a half-built entry.
+ *
+ * On success *@space_info points at the space info and 0 is returned;
+ * -ENOMEM is returned if the allocation fails.
+ */
+static int update_space_info(struct btrfs_fs_info *info, u64 flags,
+                             u64 total_bytes, u64 bytes_used,
+                             struct btrfs_space_info **space_info)
+{
+        struct btrfs_space_info *found;
+        found = __find_space_info(info, flags);
+        if (found) {
+                spin_lock(&found->lock);
+                found->total_bytes += total_bytes;
+                found->bytes_used += bytes_used;
+                found->full = 0;
+                spin_unlock(&found->lock);
+                *space_info = found;
+                return 0;
+        }
+        found = kzalloc(sizeof(*found), GFP_NOFS);
+        if (!found)
+                return -ENOMEM;
+        INIT_LIST_HEAD(&found->block_groups);
+        init_rwsem(&found->groups_sem);
+        spin_lock_init(&found->lock);
+        found->flags = flags;
+        found->total_bytes = total_bytes;
+        found->bytes_used = bytes_used;
+        found->bytes_pinned = 0;
+        found->bytes_reserved = 0;
+        found->bytes_readonly = 0;
+        found->full = 0;
+        found->force_alloc = 0;
+        /* publish only once the entry is fully set up */
+        list_add(&found->list, &info->space_info);
+        *space_info = found;
+        return 0;
+}
+/*
+ * Record the RAID0/RAID1/RAID10/DUP bits carried by @flags in the
+ * avail_data/metadata/system_alloc_bits masks of @fs_info, one mask per
+ * DATA, METADATA or SYSTEM type bit set in @flags.  Nothing is changed
+ * when @flags carries none of those four profile bits.
+ */
+static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
+{
+        const u64 profile_mask = BTRFS_BLOCK_GROUP_RAID0 |
+                                 BTRFS_BLOCK_GROUP_RAID1 |
+                                 BTRFS_BLOCK_GROUP_RAID10 |
+                                 BTRFS_BLOCK_GROUP_DUP;
+        u64 profile = flags & profile_mask;
+        if (!profile)
+                return;
+        if (flags & BTRFS_BLOCK_GROUP_DATA)
+                fs_info->avail_data_alloc_bits |= profile;
+        if (flags & BTRFS_BLOCK_GROUP_METADATA)
+                fs_info->avail_metadata_alloc_bits |= profile;
+        if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+                fs_info->avail_system_alloc_bits |= profile;
+}
+/*
+ * Mark @cache read-only.  The first time this happens, the part of the
+ * group that is not in use (key.offset minus the used bytes of its
+ * item) is added to the space info's bytes_readonly.  Both the space
+ * info lock and the block group lock are held, taken in that order.
+ */
+static void set_block_group_readonly(struct btrfs_block_group_cache *cache)
+{
+        u64 unused;
+        spin_lock(&cache->space_info->lock);
+        spin_lock(&cache->lock);
+        if (cache->ro)
+                goto unlock;
+        unused = cache->key.offset - btrfs_block_group_used(&cache->item);
+        cache->space_info->bytes_readonly += unused;
+        cache->ro = 1;
+unlock:
+        spin_unlock(&cache->lock);
+        spin_unlock(&cache->space_info->lock);
+}
+/*
+ * Strip profile bits from @flags and return the result:
+ *  - with exactly one rw device, RAID1 and RAID0 are cleared;
+ *  - with fewer than four rw devices, RAID10 is cleared;
+ *  - DUP is cleared when RAID1 or RAID10 is also set;
+ *  - RAID1 is cleared when RAID10 is also set;
+ *  - RAID0 is cleared when RAID1, RAID10 or DUP is also set.
+ * The rules are applied in that order.
+ */
+u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
+{
+        u64 rw_devs = root->fs_info->fs_devices->rw_devices;
+        if (rw_devs == 1)
+                flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
+        if (rw_devs < 4)
+                flags &= ~BTRFS_BLOCK_GROUP_RAID10;
+        if (flags & BTRFS_BLOCK_GROUP_DUP) {
+                if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
+                             BTRFS_BLOCK_GROUP_RAID10))
+                        flags &= ~BTRFS_BLOCK_GROUP_DUP;
+        }
+        if (flags & BTRFS_BLOCK_GROUP_RAID1) {
+                if (flags & BTRFS_BLOCK_GROUP_RAID10)
+                        flags &= ~BTRFS_BLOCK_GROUP_RAID1;
+        }
+        if (flags & BTRFS_BLOCK_GROUP_RAID0) {
+                if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
+                             BTRFS_BLOCK_GROUP_RAID10 |
+                             BTRFS_BLOCK_GROUP_DUP))
+                        flags &= ~BTRFS_BLOCK_GROUP_RAID0;
+        }
+        return flags;
+}
+/*
+ * Allocate a new chunk for the space info matching @flags when needed.
+ *
+ * Runs under fs_info->chunk_mutex.  @flags is first passed through
+ * btrfs_reduce_alloc_profile(); the matching space info is looked up and
+ * created (with zero sizes) if missing.  No chunk is allocated when the
+ * space info is marked full, or when neither @force nor the space
+ * info's force_alloc flag is set and used + pinned + reserved +
+ * @alloc_bytes is below the threshold computed by div_factor().
+ *
+ * Returns 0 when no allocation was attempted or it succeeded, otherwise
+ * the error from btrfs_alloc_chunk(); in that case the space info is
+ * marked full.
+ */
+static int do_chunk_alloc(struct btrfs_trans_handle *trans,
+                          struct btrfs_root *extent_root, u64 alloc_bytes,
+                          u64 flags, int force)
+{
+        struct btrfs_space_info *space_info;
+        u64 thresh;
+        int ret = 0;
+        mutex_lock(&extent_root->fs_info->chunk_mutex);
+        flags = btrfs_reduce_alloc_profile(extent_root, flags);
+        space_info = __find_space_info(extent_root->fs_info, flags);
+        if (!space_info) {
+                ret = update_space_info(extent_root->fs_info, flags,
+                                        0, 0, &space_info);
+                BUG_ON(ret);
+        }
+        BUG_ON(!space_info);
+        spin_lock(&space_info->lock);
+        /* a pending force_alloc request is consumed by this call */
+        if (space_info->force_alloc) {
+                force = 1;
+                space_info->force_alloc = 0;
+        }
+        if (space_info->full) {
+                spin_unlock(&space_info->lock);
+                goto out;
+        }
+        /*
+         * threshold is derived from the writable size (total minus
+         * read-only bytes); div_factor() is defined elsewhere --
+         * presumably it scales by 6/10, TODO confirm
+         */
+        thresh = space_info->total_bytes - space_info->bytes_readonly;
+        thresh = div_factor(thresh, 6);
+        if (!force &&
+           (space_info->bytes_used + space_info->bytes_pinned +
+            space_info->bytes_reserved + alloc_bytes) < thresh) {
+                spin_unlock(&space_info->lock);
+                goto out;
+        }
+        /* the spinlock is dropped before calling btrfs_alloc_chunk() */
+        spin_unlock(&space_info->lock);
+        ret = btrfs_alloc_chunk(trans, extent_root, flags);
+        /*
+         * NOTE(review): ->full is written here without space_info->lock,
+         * while the other accesses in this function hold it -- confirm
+         * that chunk_mutex is enough to protect this store
+         */
+        if (ret)
+                space_info->full = 1;
+out:
+        mutex_unlock(&extent_root->fs_info->chunk_mutex);
+        return ret;
+}
+/*
+ * Adjust the used-bytes accounting for the range
+ * [@bytenr, @bytenr + @num_bytes), one block group at a time.
+ *
+ * Every block group touched is marked dirty.  With @alloc set the bytes
+ * are added to the group's used count and the space info's bytes_used;
+ * otherwise they are subtracted.  For a read-only group bytes_readonly
+ * moves the opposite way.  When freeing with @mark_free set, the range
+ * is also passed to btrfs_discard_extent() and btrfs_add_free_space(),
+ * whose failures only trigger a WARN_ON.
+ *
+ * Returns 0, or -1 if no block group is found for part of the range.
+ */
+static int update_block_group(struct btrfs_trans_handle *trans,
+                              struct btrfs_root *root,
+                              u64 bytenr, u64 num_bytes, int alloc,
+                              int mark_free)
+{
+        struct btrfs_fs_info *info = root->fs_info;
+        struct btrfs_block_group_cache *cache;
+        u64 remaining = num_bytes;
+        u64 group_off;
+        u64 used;
+        int ret;
+        while (remaining) {
+                cache = btrfs_lookup_block_group(info, bytenr);
+                if (!cache)
+                        return -1;
+                group_off = bytenr - cache->key.objectid;
+                WARN_ON(group_off > cache->key.offset);
+                spin_lock(&cache->space_info->lock);
+                spin_lock(&cache->lock);
+                cache->dirty = 1;
+                used = btrfs_block_group_used(&cache->item);
+                /* never cross the end of the current block group */
+                num_bytes = min(remaining, cache->key.offset - group_off);
+                if (alloc) {
+                        used += num_bytes;
+                        cache->space_info->bytes_used += num_bytes;
+                        if (cache->ro)
+                                cache->space_info->bytes_readonly -= num_bytes;
+                } else {
+                        used -= num_bytes;
+                        cache->space_info->bytes_used -= num_bytes;
+                        if (cache->ro)
+                                cache->space_info->bytes_readonly += num_bytes;
+                }
+                btrfs_set_block_group_used(&cache->item, used);
+                spin_unlock(&cache->lock);
+                spin_unlock(&cache->space_info->lock);
+                if (!alloc && mark_free) {
+                        ret = btrfs_discard_extent(root, bytenr,
+                                                   num_bytes);
+                        WARN_ON(ret);
+                        ret = btrfs_add_free_space(cache, bytenr,
+                                                   num_bytes);
+                        WARN_ON(ret);
+                }
+                put_block_group(cache);
+                remaining -= num_bytes;
+                bytenr += num_bytes;
+        }
+        return 0;
+}
+/*
+ * Return the start (key.objectid) of the block group returned by
+ * btrfs_lookup_first_block_group() for @search_start, or 0 if it
+ * returns none.
+ */
+static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
+{
+        struct btrfs_block_group_cache *bg;
+        u64 first = 0;
+        bg = btrfs_lookup_first_block_group(root->fs_info, search_start);
+        if (bg) {
+                first = bg->key.objectid;
+                put_block_group(bg);
+        }
+        return first;
+}
+/*
+ * Pin (@pin != 0) or unpin (@pin == 0) the range
+ * [@bytenr, @bytenr + @num).
+ *
+ * The range is set or cleared as EXTENT_DIRTY in
+ * fs_info->pinned_extents, then the pinned counters of every block group
+ * it covers, of their space infos and of fs_info->total_pinned are
+ * adjusted.  On unpin the bytes are handed to btrfs_add_free_space()
+ * when the block group has ->cached set.
+ *
+ * Warns if fs_info->pinned_mutex is not held.  Always returns 0; a
+ * missing block group is a BUG.
+ */
+int btrfs_update_pinned_extents(struct btrfs_root *root,
+                                u64 bytenr, u64 num, int pin)
+{
+        struct btrfs_fs_info *fs_info = root->fs_info;
+        struct btrfs_block_group_cache *bg;
+        u64 chunk;
+        WARN_ON(!mutex_is_locked(&root->fs_info->pinned_mutex));
+        if (pin)
+                set_extent_dirty(&fs_info->pinned_extents,
+                                bytenr, bytenr + num - 1, GFP_NOFS);
+        else
+                clear_extent_dirty(&fs_info->pinned_extents,
+                                bytenr, bytenr + num - 1, GFP_NOFS);
+        for (; num > 0; bytenr += chunk, num -= chunk) {
+                bg = btrfs_lookup_block_group(fs_info, bytenr);
+                BUG_ON(!bg);
+                /* clamp to the end of the current block group */
+                chunk = min(num, bg->key.offset -
+                            (bytenr - bg->key.objectid));
+                spin_lock(&bg->space_info->lock);
+                spin_lock(&bg->lock);
+                if (pin) {
+                        bg->pinned += chunk;
+                        bg->space_info->bytes_pinned += chunk;
+                } else {
+                        bg->pinned -= chunk;
+                        bg->space_info->bytes_pinned -= chunk;
+                }
+                spin_unlock(&bg->lock);
+                spin_unlock(&bg->space_info->lock);
+                if (pin) {
+                        fs_info->total_pinned += chunk;
+                } else {
+                        fs_info->total_pinned -= chunk;
+                        if (bg->cached)
+                                btrfs_add_free_space(bg, bytenr, chunk);
+                }
+                put_block_group(bg);
+        }
+        return 0;
+}
+/*
+ * Add (@reserve != 0) or remove (@reserve == 0) the range
+ * [@bytenr, @bytenr + @num) from the reserved counters of every block
+ * group it covers and of their space infos.  Always returns 0; a
+ * missing block group is a BUG.
+ */
+static int update_reserved_extents(struct btrfs_root *root,
+                                   u64 bytenr, u64 num, int reserve)
+{
+        struct btrfs_fs_info *fs_info = root->fs_info;
+        struct btrfs_block_group_cache *bg;
+        u64 chunk;
+        for (; num > 0; bytenr += chunk, num -= chunk) {
+                bg = btrfs_lookup_block_group(fs_info, bytenr);
+                BUG_ON(!bg);
+                /* clamp to the end of the current block group */
+                chunk = min(num, bg->key.offset -
+                            (bytenr - bg->key.objectid));
+                spin_lock(&bg->space_info->lock);
+                spin_lock(&bg->lock);
+                if (reserve) {
+                        bg->reserved += chunk;
+                        bg->space_info->bytes_reserved += chunk;
+                } else {
+                        bg->reserved -= chunk;
+                        bg->space_info->bytes_reserved -= chunk;
+                }
+                spin_unlock(&bg->lock);
+                spin_unlock(&bg->space_info->lock);
+                put_block_group(bg);
+        }
+        return 0;
+}
+/*
+ * Mark in @copy every range that is EXTENT_DIRTY in
+ * fs_info->pinned_extents.  The walk is done with pinned_mutex held.
+ * Always returns 0.
+ */
+int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
+{
+        struct extent_io_tree *pinned = &root->fs_info->pinned_extents;
+        u64 next = 0;
+        u64 start;
+        u64 end;
+        mutex_lock(&root->fs_info->pinned_mutex);
+        while (!find_first_extent_bit(pinned, next, &start, &end,
+                                      EXTENT_DIRTY)) {
+                set_extent_dirty(copy, start, end, GFP_NOFS);
+                /* resume the search right after the range just copied */
+                next = end + 1;
+        }
+        mutex_unlock(&root->fs_info->pinned_mutex);
+        return 0;
+}
+/*
+ * Unpin every range marked EXTENT_DIRTY in @unpin.
+ *
+ * Each range is passed to btrfs_discard_extent(), unpinned with
+ * btrfs_update_pinned_extents() and then cleared from @unpin, so the
+ * search always restarts from offset 0.  pinned_mutex is held throughout
+ * and only dropped around cond_resched() when a reschedule is needed.
+ *
+ * NOTE(review): the value returned is the non-zero result of the last
+ * find_first_extent_bit() call that ended the loop, and the result of
+ * btrfs_discard_extent() is overwritten before it is ever looked at --
+ * confirm what callers expect from the return value.
+ */
+int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *root,
+                               struct extent_io_tree *unpin)
+{
+        u64 start;
+        u64 end;
+        int ret;
+        mutex_lock(&root->fs_info->pinned_mutex);
+        while (1) {
+                ret = find_first_extent_bit(unpin, 0, &start, &end,
+                                            EXTENT_DIRTY);
+                if (ret)
+                        break;
+                ret = btrfs_discard_extent(root, start, end + 1 - start);
+                /* requires pinned_mutex, which is held here */
+                btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
+                clear_extent_dirty(unpin, start, end, GFP_NOFS);
+                if (need_resched()) {
+                        mutex_unlock(&root->fs_info->pinned_mutex);
+                        cond_resched();
+                        mutex_lock(&root->fs_info->pinned_mutex);
+                }
+        }
+        mutex_unlock(&root->fs_info->pinned_mutex);
+        return ret;
+}
+/*
+ * Process the pending extent operations recorded in info->extent_ins
+ * (the ranges tagged EXTENT_WRITEBACK).
+ *
+ * Every range that can be locked with try_lock_extent() is sorted, by
+ * the type of its pending_extent_op, onto an insert list or a backref
+ * update list; at most max_inserts inserts are gathered per pass.
+ * Operations flagged ->del are dropped from the lists (deleted inserts
+ * are also pinned and removed from the space accounting); what remains
+ * is handed to update_backrefs() and insert_extents().
+ *
+ * When @all is set, ranges that could not be locked are retried until a
+ * pass skips nothing.
+ *
+ * Returns 0, or -ENOMEM if no path could be allocated.
+ */
+static int finish_current_insert(struct btrfs_trans_handle *trans,
+                                 struct btrfs_root *extent_root, int all)
+{
+        u64 start;
+        u64 end;
+        u64 priv;
+        u64 search = 0;
+        u64 skipped = 0;
+        struct btrfs_fs_info *info = extent_root->fs_info;
+        struct btrfs_path *path;
+        struct pending_extent_op *extent_op, *tmp;
+        struct list_head insert_list, update_list;
+        int ret;
+        int num_inserts = 0, max_inserts;
+        path = btrfs_alloc_path();
+        /* the path is handed to the helpers below, never run without one */
+        if (!path)
+                return -ENOMEM;
+        INIT_LIST_HEAD(&insert_list);
+        INIT_LIST_HEAD(&update_list);
+        /* number of extent item + extent ref pairs batched per pass */
+        max_inserts = extent_root->leafsize /
+                (2 * sizeof(struct btrfs_key) + 2 * sizeof(struct btrfs_item) +
+                 sizeof(struct btrfs_extent_ref) +
+                 sizeof(struct btrfs_extent_item));
+again:
+        mutex_lock(&info->extent_ins_mutex);
+        while (1) {
+                ret = find_first_extent_bit(&info->extent_ins, search, &start,
+                                            &end, EXTENT_WRITEBACK);
+                if (ret) {
+                        /*
+                         * nothing more from 'search' on; if ranges were
+                         * skipped and nothing is queued for insert yet,
+                         * rescan from the start
+                         */
+                        if (skipped && all && !num_inserts) {
+                                skipped = 0;
+                                search = 0;
+                                continue;
+                        }
+                        mutex_unlock(&info->extent_ins_mutex);
+                        break;
+                }
+                ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS);
+                if (!ret) {
+                        /* could not lock the range: skip it for now */
+                        skipped = 1;
+                        search = end + 1;
+                        if (need_resched()) {
+                                mutex_unlock(&info->extent_ins_mutex);
+                                cond_resched();
+                                mutex_lock(&info->extent_ins_mutex);
+                        }
+                        continue;
+                }
+                /* the state's private value holds the pending_extent_op */
+                ret = get_state_private(&info->extent_ins, start, &priv);
+                BUG_ON(ret);
+                extent_op = (struct pending_extent_op *)(unsigned long) priv;
+                if (extent_op->type == PENDING_EXTENT_INSERT) {
+                        num_inserts++;
+                        list_add_tail(&extent_op->list, &insert_list);
+                        search = end + 1;
+                        if (num_inserts == max_inserts) {
+                                mutex_unlock(&info->extent_ins_mutex);
+                                break;
+                        }
+                } else if (extent_op->type == PENDING_BACKREF_UPDATE) {
+                        list_add_tail(&extent_op->list, &update_list);
+                        search = end + 1;
+                } else {
+                        BUG();
+                }
+        }
+        /*
+         * process the update list, clear the writeback bit for it, and if
+         * somebody marked this thing for deletion then just unlock it and be
+         * done, the free_extents will handle it
+         */
+        mutex_lock(&info->extent_ins_mutex);
+        list_for_each_entry_safe(extent_op, tmp, &update_list, list) {
+                clear_extent_bits(&info->extent_ins, extent_op->bytenr,
+                                  extent_op->bytenr + extent_op->num_bytes - 1,
+                                  EXTENT_WRITEBACK, GFP_NOFS);
+                if (extent_op->del) {
+                        list_del_init(&extent_op->list);
+                        unlock_extent(&info->extent_ins, extent_op->bytenr,
+                                      extent_op->bytenr + extent_op->num_bytes
+                                      - 1, GFP_NOFS);
+                        kfree(extent_op);
+                }
+        }
+        mutex_unlock(&info->extent_ins_mutex);
+        /*
+         * still have things left on the update list, go ahead and update
+         * everything
+         */
+        if (!list_empty(&update_list)) {
+                ret = update_backrefs(trans, extent_root, path, &update_list);
+                BUG_ON(ret);
+        }
+        /*
+         * if no inserts need to be done, but we skipped some extents and we
+         * need to make sure everything is cleaned then reset everything and
+         * go back to the beginning
+         */
+        if (!num_inserts && all && skipped) {
+                search = 0;
+                skipped = 0;
+                INIT_LIST_HEAD(&update_list);
+                INIT_LIST_HEAD(&insert_list);
+                goto again;
+        } else if (!num_inserts) {
+                goto out;
+        }
+        /*
+         * process the insert extents list.  Again if we are deleting this
+         * extent, then just unlock it, pin down the bytes if need be, and be
+         * done with it.  Saves us from having to actually insert the extent
+         * into the tree and then subsequently come along and delete it
+         */
+        mutex_lock(&info->extent_ins_mutex);
+        list_for_each_entry_safe(extent_op, tmp, &insert_list, list) {
+                clear_extent_bits(&info->extent_ins, extent_op->bytenr,
+                                  extent_op->bytenr + extent_op->num_bytes - 1,
+                                  EXTENT_WRITEBACK, GFP_NOFS);
+                if (extent_op->del) {
+                        u64 used;
+                        list_del_init(&extent_op->list);
+                        unlock_extent(&info->extent_ins, extent_op->bytenr,
+                                      extent_op->bytenr + extent_op->num_bytes
+                                      - 1, GFP_NOFS);
+                        mutex_lock(&extent_root->fs_info->pinned_mutex);
+                        ret = pin_down_bytes(trans, extent_root,
+                                             extent_op->bytenr,
+                                             extent_op->num_bytes, 0);
+                        mutex_unlock(&extent_root->fs_info->pinned_mutex);
+                        /* take the bytes back out of the used counters */
+                        spin_lock(&info->delalloc_lock);
+                        used = btrfs_super_bytes_used(&info->super_copy);
+                        btrfs_set_super_bytes_used(&info->super_copy,
+                                        used - extent_op->num_bytes);
+                        used = btrfs_root_used(&extent_root->root_item);
+                        btrfs_set_root_used(&extent_root->root_item,
+                                        used - extent_op->num_bytes);
+                        spin_unlock(&info->delalloc_lock);
+                        /*
+                         * pin_down_bytes() returned > 0 if the block was
+                         * not pinned; only then is it marked free
+                         */
+                        ret = update_block_group(trans, extent_root,
+                                                 extent_op->bytenr,
+                                                 extent_op->num_bytes,
+                                                 0, ret > 0);
+                        BUG_ON(ret);
+                        kfree(extent_op);
+                        num_inserts--;
+                }
+        }
+        mutex_unlock(&info->extent_ins_mutex);
+        ret = insert_extents(trans, extent_root, path, &insert_list,
+                             num_inserts);
+        BUG_ON(ret);
+        /*
+         * if we broke out of the loop in order to insert stuff because we hit
+         * the maximum number of inserts at a time we can handle, then loop
+         * back and pick up where we left off
+         */
+        if (num_inserts == max_inserts) {
+                INIT_LIST_HEAD(&insert_list);
+                INIT_LIST_HEAD(&update_list);
+                num_inserts = 0;
+                goto again;
+        }
+        /*
+         * again, if we need to make absolutely sure there are no more pending
+         * extent operations left and we know that we skipped some, go back to
+         * the beginning and do it all again
+         */
+        if (all && skipped) {
+                INIT_LIST_HEAD(&insert_list);
+                INIT_LIST_HEAD(&update_list);
+                search = 0;
+                skipped = 0;
+                num_inserts = 0;
+                goto again;
+        }
+out:
+        btrfs_free_path(path);
+        return 0;
+}
+/*
+ * Pin the range [@bytenr, @bytenr + @num_bytes), unless it is a tree
+ * block that can be reused right away.
+ *
+ * Data extents (@is_data != 0) and tree blocks that
+ * btrfs_find_tree_block() does not return are always pinned.  A tree
+ * block that is found is instead cleaned with clean_tree_block() when it
+ * is uptodate, can be tree-locked without blocking, is not owned by the
+ * tree log or the relocation tree, was created in this transaction and
+ * does not carry the WRITTEN header flag.
+ *
+ * Returns 1 if the block was cleaned without being pinned, 0 if the
+ * range was pinned.
+ */
+static int pin_down_bytes(struct btrfs_trans_handle *trans,
+                          struct btrfs_root *root,
+                          u64 bytenr, u64 num_bytes, int is_data)
+{
+        int err = 0;
+        struct extent_buffer *buf;
+        if (is_data)
+                goto pinit;
+        buf = btrfs_find_tree_block(root, bytenr, num_bytes);
+        if (!buf)
+                goto pinit;
+        /* we can reuse a block if it hasn't been written
+         * and it is from this transaction.  We can't
+         * reuse anything from the tree log root because
+         * it has tiny sub-transactions.
+         */
+        if (btrfs_buffer_uptodate(buf, 0) &&
+            btrfs_try_tree_lock(buf)) {
+                u64 header_owner = btrfs_header_owner(buf);
+                u64 header_transid = btrfs_header_generation(buf);
+                if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
+                    header_owner != BTRFS_TREE_RELOC_OBJECTID &&
+                    header_transid == trans->transid &&
+                    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
+                        clean_tree_block(NULL, root, buf);
+                        btrfs_tree_unlock(buf);
+                        free_extent_buffer(buf);
+                        return 1;
+                }
+                btrfs_tree_unlock(buf);
+        }
+        free_extent_buffer(buf);
+pinit:
+        /* keep the result so that the check below tests something real */
+        err = btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
+        BUG_ON(err < 0);
+        return 0;
+}
+/*
+ * remove an extent from the root, returns 0 on success
+ */
+static int __free_extent(struct btrfs_trans_handle *trans,
+                         struct btrfs_root *root,
+                         u64 bytenr, u64 num_bytes, u64 parent,
+                         u64 root_objectid, u64 ref_generation,
+                         u64 owner_objectid, int pin, int mark_free)
+{
+        struct btrfs_path *path;
+        struct btrfs_key key;
+        struct btrfs_fs_info *info = root->fs_info;
+        struct btrfs_root *extent_root = info->extent_root;
+        struct extent_buffer *leaf;
+        int ret;
+        int extent_slot = 0;
+        int found_extent = 0;
+        int num_to_del = 1;
+        struct btrfs_extent_item *ei;
+        u32 refs;
+        key.objectid = bytenr;
+        btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
+        key.offset = num_bytes;
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+        path->reada = 1;
+        ret = lookup_extent_backref(trans, extent_root, path,
+                                    bytenr, parent, root_objectid,
+                                    ref_generation, owner_objectid, 1);
+        if (ret == 0) {
+                struct btrfs_key found_key;
+                extent_slot = path->slots[0];
+                while (extent_slot > 0) {
+                        extent_slot--;
+                        btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+                                              extent_slot);
+                        if (found_key.objectid != bytenr)
+                                break;
+                        if (found_key.type == BTRFS_EXTENT_ITEM_KEY &&
+                            found_key.offset == num_bytes) {
+                                found_extent = 1;
+                                break;
+                        }
+                        if (path->slots[0] - extent_slot > 5)
+                                break;
+                }
+                if (!found_extent) {
+                        ret = remove_extent_backref(trans, extent_root, path);
+                        BUG_ON(ret);
+                        btrfs_release_path(extent_root, path);
+                        ret = btrfs_search_slot(trans, extent_root,
+                                                &key, path, -1, 1);
+                        if (ret) {
+                                printk(KERN_ERR "umm, got %d back from search"
+                                       ", was looking for %llu\n", ret,
+                                       (unsigned long long)bytenr);
+                                btrfs_print_leaf(extent_root, path->nodes[0]);
+                        }
+                        BUG_ON(ret);
+                        extent_slot = path->slots[0];
+                }
+        } else {
+                btrfs_print_leaf(extent_root, path->nodes[0]);
+                WARN_ON(1);
+                printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
+                       "root %llu gen %llu owner %llu\n",
+                       (unsigned long long)bytenr,
+                       (unsigned long long)root_objectid,
+                       (unsigned long long)ref_generation,
+                       (unsigned long long)owner_objectid);
+        }
+        leaf = path->nodes[0];
+        ei = btrfs_item_ptr(leaf, extent_slot,
+                            struct btrfs_extent_item);
+        refs = btrfs_extent_refs(leaf, ei);
+        BUG_ON(refs == 0);
+        refs -= 1;
+        btrfs_set_extent_refs(leaf, ei, refs);
+        btrfs_mark_buffer_dirty(leaf);
+        if (refs == 0 && found_extent && path->slots[0] == extent_slot + 1) {
+                struct btrfs_extent_ref *ref;
+                ref = btrfs_item_ptr(leaf, path->slots[0],
+                                     struct btrfs_extent_ref);
+                BUG_ON(btrfs_ref_num_refs(leaf, ref) != 1);
+                /* if the back ref and the extent are next to each other
+                 * they get deleted below in one shot
+                 */
+                path->slots[0] = extent_slot;
+                num_to_del = 2;
+        } else if (found_extent) {
+                /* otherwise delete the extent back ref */
+                ret = remove_extent_backref(trans, extent_root, path);
+                BUG_ON(ret);
+                /* if refs are 0, we need to setup the path for deletion */
+                if (refs == 0) {
+                        btrfs_release_path(extent_root, path);
+                        ret = btrfs_search_slot(trans, extent_root, &key, path,
+                                                -1, 1);
+                        BUG_ON(ret);
+                }
+        }
+        if (refs == 0) {
+                u64 super_used;
+                u64 root_used;
+                if (pin) {
+                        mutex_lock(&root->fs_info->pinned_mutex);
+                        ret = pin_down_bytes(trans, root, bytenr, num_bytes,
+                                owner_objectid >= BTRFS_FIRST_FREE_OBJECTID);
+                        mutex_unlock(&root->fs_info->pinned_mutex);
+                        if (ret > 0)
+                                mark_free = 1;
+                        BUG_ON(ret < 0);
+                }
+                /* block accounting for super block */
+                spin_lock(&info->delalloc_lock);
+                super_used = btrfs_super_bytes_used(&info->super_copy);
+                btrfs_set_super_bytes_used(&info->super_copy,
+                                           super_used - num_bytes);
+                /* block accounting for root item */
+                root_used = btrfs_root_used(&root->root_item);
+                btrfs_set_root_used(&root->root_item,
+                                           root_used - num_bytes);
+                spin_unlock(&info->delalloc_lock);
+                ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
+                                      num_to_del);
+                BUG_ON(ret);
+                btrfs_release_path(extent_root, path);
+                if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
+                        ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
+                        BUG_ON(ret);
+                }
+                ret = update_block_group(trans, root, bytenr, num_bytes, 0,
+                                         mark_free);
+                BUG_ON(ret);
+        }
+        btrfs_free_path(path);
+        finish_current_insert(trans, extent_root, 0);
+        return ret;
+}
+/*
+ * Process the extent deletions queued in fs_info->pending_del.
+ *
+ * Every range tagged EXTENT_WRITEBACK in pending_del carries a
+ * struct pending_extent_op in its state private.  For each range that can
+ * be locked in extent_ins:
+ *  - if extent_ins has no EXTENT_WRITEBACK bit for it, the op is collected
+ *    on a local list that is handed to free_extents();
+ *  - otherwise the pending_del op is freed and the op stored in extent_ins
+ *    decides: a PENDING_BACKREF_UPDATE op is collected on the same list,
+ *    any other op is finished here by pinning the bytes and updating the
+ *    block group.
+ *
+ * @all: when non-zero, ranges whose try_lock_extent() failed are retried
+ *       until a scan completes without skipping anything.
+ *
+ * Returns err, which is only set from a non-zero ret at the bottom of the
+ * scan loop; helper failures are caught by BUG_ON().
+ */
+static int del_pending_extents(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *extent_root, int all)
+{
+        int ret;
+        int err = 0;
+        u64 start;
+        u64 end;
+        u64 priv;
+        u64 search = 0;
+        int nr = 0, skipped = 0;
+        struct extent_io_tree *pending_del;
+        struct extent_io_tree *extent_ins;
+        struct pending_extent_op *extent_op;
+        struct btrfs_fs_info *info = extent_root->fs_info;
+        struct list_head delete_list;
+        INIT_LIST_HEAD(&delete_list);
+        extent_ins = &extent_root->fs_info->extent_ins;
+        pending_del = &extent_root->fs_info->pending_del;
+again:
+        mutex_lock(&info->extent_ins_mutex);
+        while (1) {
+                ret = find_first_extent_bit(pending_del, search, &start, &end,
+                                            EXTENT_WRITEBACK);
+                if (ret) {
+                        /*
+                         * Nothing left at or after 'search'.  If ranges were
+                         * skipped and nothing has been collected yet, rescan
+                         * from the start.  'skipped' has to be cleared here:
+                         * it is set again only if a range is still locked,
+                         * so leaving it set would make both this restart and
+                         * the 'goto again' below repeat forever once the
+                         * tree is empty.
+                         */
+                        if (all && skipped && !nr) {
+                                search = 0;
+                                skipped = 0;
+                                continue;
+                        }
+                        mutex_unlock(&info->extent_ins_mutex);
+                        break;
+                }
+                ret = try_lock_extent(extent_ins, start, end, GFP_NOFS);
+                if (!ret) {
+                        /* range is locked elsewhere, remember and move on */
+                        search = end+1;
+                        skipped = 1;
+                        if (need_resched()) {
+                                mutex_unlock(&info->extent_ins_mutex);
+                                cond_resched();
+                                mutex_lock(&info->extent_ins_mutex);
+                        }
+                        continue;
+                }
+                BUG_ON(ret < 0);
+                ret = get_state_private(pending_del, start, &priv);
+                BUG_ON(ret);
+                extent_op = (struct pending_extent_op *)(unsigned long)priv;
+                clear_extent_bits(pending_del, start, end, EXTENT_WRITEBACK,
+                                  GFP_NOFS);
+                if (!test_range_bit(extent_ins, start, end,
+                                    EXTENT_WRITEBACK, 0)) {
+                        /* no op queued in extent_ins: defer to free_extents */
+                        list_add_tail(&extent_op->list, &delete_list);
+                        nr++;
+                } else {
+                        /*
+                         * the range also has an op queued in extent_ins;
+                         * drop the delete op and act on the queued one
+                         */
+                        kfree(extent_op);
+                        ret = get_state_private(&info->extent_ins, start,
+                                                &priv);
+                        BUG_ON(ret);
+                        extent_op = (struct pending_extent_op *)
+                                                (unsigned long)priv;
+                        clear_extent_bits(&info->extent_ins, start, end,
+                                          EXTENT_WRITEBACK, GFP_NOFS);
+                        if (extent_op->type == PENDING_BACKREF_UPDATE) {
+                                list_add_tail(&extent_op->list, &delete_list);
+                                search = end + 1;
+                                nr++;
+                                continue;
+                        }
+                        mutex_lock(&extent_root->fs_info->pinned_mutex);
+                        ret = pin_down_bytes(trans, extent_root, start,
+                                             end + 1 - start, 0);
+                        mutex_unlock(&extent_root->fs_info->pinned_mutex);
+                        /* a positive pin_down_bytes() result is passed on
+                         * as the last (mark_free) argument
+                         */
+                        ret = update_block_group(trans, extent_root, start,
+                                                end + 1 - start, 0, ret > 0);
+                        unlock_extent(extent_ins, start, end, GFP_NOFS);
+                        BUG_ON(ret);
+                        kfree(extent_op);
+                }
+                if (ret)
+                        err = ret;
+                search = end + 1;
+                if (need_resched()) {
+                        mutex_unlock(&info->extent_ins_mutex);
+                        cond_resched();
+                        mutex_lock(&info->extent_ins_mutex);
+                }
+        }
+        if (nr) {
+                ret = free_extents(trans, extent_root, &delete_list);
+                BUG_ON(ret);
+        }
+        /* some ranges were locked during the last scan, try them again */
+        if (all && skipped) {
+                INIT_LIST_HEAD(&delete_list);
+                search = 0;
+                nr = 0;
+                goto again;
+        }
+        return err;
+}
+/*
+ * Drop one reference on the extent [bytenr, bytenr + num_bytes).
+ *
+ * Frees requested by the extent root itself are never applied directly:
+ * they are recorded as a PENDING_EXTENT_DELETE op in fs_info->pending_del
+ * (or simply flagged on a still-queued insert) and 0 is returned.
+ * For every other root the reference is removed through __free_extent()
+ * and the queued insert/delete work is flushed afterwards.
+ *
+ * Returns the __free_extent() result if it is non-zero, otherwise the
+ * result of del_pending_extents().
+ */
+static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *root,
+                               u64 bytenr, u64 num_bytes, u64 parent,
+                               u64 root_objectid, u64 ref_generation,
+                               u64 owner_objectid, int pin)
+{
+        struct btrfs_fs_info *fs_info = root->fs_info;
+        struct btrfs_root *extent_root = fs_info->extent_root;
+        int ret;
+        int pending_ret;
+        WARN_ON(num_bytes < root->sectorsize);
+        if (root == extent_root) {
+                struct pending_extent_op *del_op;
+                u64 last_byte = bytenr + num_bytes - 1;
+                mutex_lock(&fs_info->extent_ins_mutex);
+                if (test_range_bit(&fs_info->extent_ins, bytenr, last_byte,
+                                   EXTENT_WRITEBACK, 0)) {
+                        struct pending_extent_op *queued;
+                        u64 priv;
+                        ret = get_state_private(&fs_info->extent_ins,
+                                                bytenr, &priv);
+                        BUG_ON(ret);
+                        queued = (struct pending_extent_op *)
+                                                (unsigned long)priv;
+                        queued->del = 1;
+                        /* a queued insert only needs to be flagged */
+                        if (queued->type == PENDING_EXTENT_INSERT) {
+                                mutex_unlock(&fs_info->extent_ins_mutex);
+                                return 0;
+                        }
+                        /* delete against the values the queued op started
+                         * from
+                         */
+                        ref_generation = queued->orig_generation;
+                        parent = queued->orig_parent;
+                }
+                del_op = kmalloc(sizeof(*del_op), GFP_NOFS);
+                BUG_ON(!del_op);
+                INIT_LIST_HEAD(&del_op->list);
+                del_op->type = PENDING_EXTENT_DELETE;
+                del_op->del = 0;
+                del_op->bytenr = bytenr;
+                del_op->num_bytes = num_bytes;
+                del_op->level = (int)owner_objectid;
+                del_op->parent = parent;
+                del_op->orig_parent = parent;
+                del_op->generation = ref_generation;
+                del_op->orig_generation = ref_generation;
+                set_extent_bits(&fs_info->pending_del, bytenr, last_byte,
+                                EXTENT_WRITEBACK, GFP_NOFS);
+                set_state_private(&fs_info->pending_del,
+                                  bytenr, (unsigned long)del_op);
+                mutex_unlock(&fs_info->extent_ins_mutex);
+                return 0;
+        }
+        if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
+                if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
+                        /* same steps as btrfs_free_reserved_extent */
+                        struct btrfs_block_group_cache *group;
+                        group = btrfs_lookup_block_group(fs_info, bytenr);
+                        BUG_ON(!group);
+                        btrfs_add_free_space(group, bytenr, num_bytes);
+                        put_block_group(group);
+                        update_reserved_extents(root, bytenr, num_bytes, 0);
+                        return 0;
+                }
+                /* metadata is always pinned */
+                pin = 1;
+        }
+        /* data is pinned when another transaction has committed it */
+        if (trans->transid != ref_generation)
+                pin = 1;
+        ret = __free_extent(trans, root, bytenr, num_bytes, parent,
+                            root_objectid, ref_generation,
+                            owner_objectid, pin, !pin);
+        finish_current_insert(trans, extent_root, 0);
+        pending_ret = del_pending_extents(trans, extent_root, 0);
+        if (ret)
+                return ret;
+        return pending_ret;
+}
+/*
+ * Exported entry point for dropping an extent reference: forwards every
+ * argument unchanged to __btrfs_free_extent() and returns its result.
+ */
+int btrfs_free_extent(struct btrfs_trans_handle *trans,
+                      struct btrfs_root *root,
+                      u64 bytenr, u64 num_bytes, u64 parent,
+                      u64 root_objectid, u64 ref_generation,
+                      u64 owner_objectid, int pin)
+{
+        return __btrfs_free_extent(trans, root, bytenr, num_bytes, parent,
+                                   root_objectid, ref_generation,
+                                   owner_objectid, pin);
+}
+/*
+ * Round val up using a mask built from root->stripesize.
+ * NOTE(review): the mask arithmetic is only a true round-up to a stripe
+ * boundary if stripesize is a power of two - confirm against the callers.
+ */
+static u64 stripe_align(struct btrfs_root *root, u64 val)
+{
+        u64 low_bits = (u64)root->stripesize - 1;
+        return (val + low_bits) & ~low_bits;
+}
+/*
+ * walks the block groups of the matching space_info and finds free space
+ * of a given size.
+ * The key ins is changed to record the allocation:
+ * ins->objectid == start byte of the allocation
+ * key type of ins == BTRFS_EXTENT_ITEM_KEY
+ * ins->offset == num_bytes
+ * Any available space before search_start is skipped.
+ *
+ * empty_size is extra room added to the amount searched for (it is given
+ * up again once every block group has been tried), exclude_start and
+ * exclude_nr describe a byte range the allocation must not overlap, and
+ * data holds the block group flags the chosen group has to match.
+ *
+ * Returns 0 when ins was filled in.  Otherwise returns -ENOSPC, or the
+ * non-zero value left in ret by cache_block_group() or do_chunk_alloc().
+ */
+static noinline int find_free_extent(struct btrfs_trans_handle *trans,
+                                     struct btrfs_root *orig_root,
+                                     u64 num_bytes, u64 empty_size,
+                                     u64 search_start, u64 search_end,
+                                     u64 hint_byte, struct btrfs_key *ins,
+                                     u64 exclude_start, u64 exclude_nr,
+                                     int data)
+{
+        int ret = 0;
+        struct btrfs_root *root = orig_root->fs_info->extent_root;
+        u64 total_needed = num_bytes;
+        u64 *last_ptr = NULL;
+        u64 last_wanted = 0;
+        struct btrfs_block_group_cache *block_group = NULL;
+        int chunk_alloc_done = 0;
+        int empty_cluster = 2 * 1024 * 1024;
+        int allowed_chunk_alloc = 0;
+        struct list_head *head = NULL, *cur = NULL;
+        int loop = 0;
+        int extra_loop = 0;
+        struct btrfs_space_info *space_info;
+        WARN_ON(num_bytes < root->sectorsize);
+        btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
+        ins->objectid = 0;
+        ins->offset = 0;
+        if (orig_root->ref_cows || empty_size)
+                allowed_chunk_alloc = 1;
+        /*
+         * last_ptr, when set, points at the end of the previous allocation
+         * of this kind; it is used as the hint and updated on success
+         */
+        if (data & BTRFS_BLOCK_GROUP_METADATA) {
+                last_ptr = &root->fs_info->last_alloc;
+                empty_cluster = 64 * 1024;
+        }
+        if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD))
+                last_ptr = &root->fs_info->last_data_alloc;
+        if (last_ptr) {
+                if (*last_ptr) {
+                        hint_byte = *last_ptr;
+                        last_wanted = *last_ptr;
+                } else
+                        empty_size += empty_cluster;
+        } else {
+                empty_cluster = 0;
+        }
+        search_start = max(search_start, first_logical_byte(root, 0));
+        search_start = max(search_start, hint_byte);
+        /* the search cannot start right after the last allocation, so ask
+         * for a whole new cluster instead
+         */
+        if (last_wanted && search_start != last_wanted) {
+                last_wanted = 0;
+                empty_size += empty_cluster;
+        }
+        total_needed += empty_size;
+        block_group = btrfs_lookup_block_group(root->fs_info, search_start);
+        if (!block_group)
+                block_group = btrfs_lookup_first_block_group(root->fs_info,
+                                                             search_start);
+        space_info = __find_space_info(root->fs_info, data);
+        down_read(&space_info->groups_sem);
+        while (1) {
+                struct btrfs_free_space *free_space;
+                /*
+                 * the only way this happens is if our hint points to a block
+                 * group that's not of the proper type, while looping this
+                 * should never happen
+                 */
+                if (empty_size)
+                        extra_loop = 1;
+                if (!block_group)
+                        goto new_group_no_lock;
+                if (unlikely(!block_group->cached)) {
+                        mutex_lock(&block_group->cache_mutex);
+                        ret = cache_block_group(root, block_group);
+                        mutex_unlock(&block_group->cache_mutex);
+                        if (ret)
+                                break;
+                }
+                mutex_lock(&block_group->alloc_mutex);
+                if (unlikely(!block_group_bits(block_group, data)))
+                        goto new_group;
+                if (unlikely(block_group->ro))
+                        goto new_group;
+                free_space = btrfs_find_free_space(block_group, search_start,
+                                                   total_needed);
+                if (free_space) {
+                        u64 start = block_group->key.objectid;
+                        u64 end = block_group->key.objectid +
+                                block_group->key.offset;
+                        search_start = stripe_align(root, free_space->offset);
+                        /* move on to the next group */
+                        if (search_start + num_bytes >= search_end)
+                                goto new_group;
+                        /* move on to the next group */
+                        if (search_start + num_bytes > end)
+                                goto new_group;
+                        if (last_wanted && search_start != last_wanted) {
+                                total_needed += empty_cluster;
+                                empty_size += empty_cluster;
+                                last_wanted = 0;
+                                /*
+                                 * if search_start is still in this block group
+                                 * then we just re-search this block group
+                                 */
+                                if (search_start >= start &&
+                                    search_start < end) {
+                                        mutex_unlock(&block_group->alloc_mutex);
+                                        continue;
+                                }
+                                /* else we go to the next block group */
+                                goto new_group;
+                        }
+                        /* the candidate overlaps the excluded range, retry
+                         * from the first byte after it
+                         */
+                        if (exclude_nr > 0 &&
+                            (search_start + num_bytes > exclude_start &&
+                             search_start < exclude_start + exclude_nr)) {
+                                search_start = exclude_start + exclude_nr;
+                                /*
+                                 * if search_start is still in this block group
+                                 * then we just re-search this block group
+                                 */
+                                if (search_start >= start &&
+                                    search_start < end) {
+                                        mutex_unlock(&block_group->alloc_mutex);
+                                        last_wanted = 0;
+                                        continue;
+                                }
+                                /* else we go to the next block group */
+                                goto new_group;
+                        }
+                        ins->objectid = search_start;
+                        ins->offset = num_bytes;
+                        btrfs_remove_free_space_lock(block_group, search_start,
+                                                     num_bytes);
+                        /* we are all good, lets return */
+                        mutex_unlock(&block_group->alloc_mutex);
+                        break;
+                }
+new_group:
+                /* done with this group: drop its mutex and our reference */
+                mutex_unlock(&block_group->alloc_mutex);
+                put_block_group(block_group);
+                block_group = NULL;
+new_group_no_lock:
+                /* don't try to compare new allocations against the
+                 * last allocation any more
+                 */
+                last_wanted = 0;
+                /*
+                 * Here's how this works.
+                 * loop == 0: we were searching a block group via a hint
+                 *              and didn't find anything, so we start at
+                 *              the head of the block groups and keep searching
+                 * loop == 1: we're searching through all of the block groups
+                 *              if we hit the head again we have searched
+                 *              all of the block groups for this space and we
+                 *              need to try and allocate, if we can't, error
+                 *              out.
+                 * loop == 2: we allocated more space and are looping through
+                 *              all of the block groups again.
+                 */
+                if (loop == 0) {
+                        head = &space_info->block_groups;
+                        cur = head->next;
+                        loop++;
+                } else if (loop == 1 && cur == head) {
+                        int keep_going;
+                        /* at this point we give up on the empty_size
+                         * allocations and just try to allocate the min
+                         * space.
+                         *
+                         * The extra_loop field was set if an empty_size
+                         * allocation was attempted above, and if this
+                         * is true we need to try the loop again without
+                         * the additional empty_size.
+                         */
+                        total_needed -= empty_size;
+                        empty_size = 0;
+                        keep_going = extra_loop;
+                        loop++;
+                        if (allowed_chunk_alloc && !chunk_alloc_done) {
+                                /* groups_sem is dropped around the chunk
+                                 * allocation and retaken afterwards
+                                 */
+                                up_read(&space_info->groups_sem);
+                                ret = do_chunk_alloc(trans, root, num_bytes +
+                                                     2 * 1024 * 1024, data, 1);
+                                down_read(&space_info->groups_sem);
+                                if (ret < 0)
+                                        goto loop_check;
+                                head = &space_info->block_groups;
+                                /*
+                                 * we've allocated a new chunk, keep
+                                 * trying
+                                 */
+                                keep_going = 1;
+                                chunk_alloc_done = 1;
+                        } else if (!allowed_chunk_alloc) {
+                                space_info->force_alloc = 1;
+                        }
+loop_check:
+                        if (keep_going) {
+                                cur = head->next;
+                                extra_loop = 0;
+                        } else {
+                                break;
+                        }
+                } else if (cur == head) {
+                        break;
+                }
+                /* take a reference on the next group in the list and start
+                 * searching at its first byte
+                 */
+                block_group = list_entry(cur, struct btrfs_block_group_cache,
+                                         list);
+                atomic_inc(&block_group->count);
+                search_start = block_group->key.objectid;
+                cur = cur->next;
+        }
+        /* we found what we needed */
+        if (ins->objectid) {
+                if (!(data & BTRFS_BLOCK_GROUP_DATA))
+                        trans->block_group = block_group->key.objectid;
+                if (last_ptr)
+                        *last_ptr = ins->objectid + ins->offset;
+                ret = 0;
+        } else if (!ret) {
+                printk(KERN_ERR "btrfs searching for %llu bytes, "
+                       "num_bytes %llu, loop %d, allowed_alloc %d\n",
+                       (unsigned long long)total_needed,
+                       (unsigned long long)num_bytes,
+                       loop, allowed_chunk_alloc);
+                ret = -ENOSPC;
+        }
+        if (block_group)
+                put_block_group(block_group);
+        up_read(&space_info->groups_sem);
+        return ret;
+}
+/*
+ * Log the state of a space_info at KERN_INFO: first its remaining bytes
+ * and full flag, then one line per block group on info->block_groups.
+ * bytes is handed to btrfs_dump_free_space() for every group.
+ * The list is walked under groups_sem (read) and each group is printed
+ * with its spinlock held.
+ */
+static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
+{
+        struct list_head *pos;
+        unsigned long long unused;
+        unused = (unsigned long long)(info->total_bytes - info->bytes_used -
+                                      info->bytes_pinned -
+                                      info->bytes_reserved);
+        printk(KERN_INFO "space_info has %llu free, is %sfull\n",
+               unused, (info->full) ? "" : "not ");
+        down_read(&info->groups_sem);
+        list_for_each(pos, &info->block_groups) {
+                struct btrfs_block_group_cache *group;
+                group = list_entry(pos, struct btrfs_block_group_cache, list);
+                spin_lock(&group->lock);
+                printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
+                       "%llu pinned %llu reserved\n",
+                       (unsigned long long)group->key.objectid,
+                       (unsigned long long)group->key.offset,
+                       (unsigned long long)btrfs_block_group_used(&group->item),
+                       (unsigned long long)group->pinned,
+                       (unsigned long long)group->reserved);
+                btrfs_dump_free_space(group, bytes);
+                spin_unlock(&group->lock);
+        }
+        up_read(&info->groups_sem);
+}
+/*
+ * Pick the allocation profile for the request and search for free space,
+ * retrying with smaller sizes when the search reports -ENOSPC.
+ *
+ * A non-zero data selects the data profile, the chunk root gets the system
+ * profile and everything else the metadata profile.  On -ENOSPC num_bytes
+ * is halved, masked with ~(sectorsize - 1) and clamped to min_alloc_size
+ * before the next attempt; once num_bytes is no longer above
+ * min_alloc_size the failure is final.
+ *
+ * Any failure that survives the retries is logged and ends in BUG(), so a
+ * return from this function always carries 0.
+ */
+static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
+                                  struct btrfs_root *root,
+                                  u64 num_bytes, u64 min_alloc_size,
+                                  u64 empty_size, u64 hint_byte,
+                                  u64 search_end, struct btrfs_key *ins,
+                                  u64 data)
+{
+        struct btrfs_fs_info *info = root->fs_info;
+        u64 search_start = 0;
+        int ret;
+        if (data)
+                data = BTRFS_BLOCK_GROUP_DATA |
+                        (info->avail_data_alloc_bits &
+                         info->data_alloc_profile);
+        else if (root == info->chunk_root)
+                data = BTRFS_BLOCK_GROUP_SYSTEM |
+                        (info->avail_system_alloc_bits &
+                         info->system_alloc_profile);
+        else
+                data = BTRFS_BLOCK_GROUP_METADATA |
+                        (info->avail_metadata_alloc_bits &
+                         info->metadata_alloc_profile);
+        for (;;) {
+                data = btrfs_reduce_alloc_profile(root, data);
+                /*
+                 * the only place that sets empty_size is btrfs_realloc_node,
+                 * which is not called recursively on allocations
+                 */
+                if (empty_size || root->ref_cows) {
+                        /* non-metadata requests first try a metadata chunk
+                         * allocation as well
+                         */
+                        if (!(data & BTRFS_BLOCK_GROUP_METADATA)) {
+                                ret = do_chunk_alloc(trans, info->extent_root,
+                                             2 * 1024 * 1024,
+                                             BTRFS_BLOCK_GROUP_METADATA |
+                                             (info->metadata_alloc_profile &
+                                              info->avail_metadata_alloc_bits),
+                                             0);
+                        }
+                        ret = do_chunk_alloc(trans, info->extent_root,
+                                             num_bytes + 2 * 1024 * 1024,
+                                             data, 0);
+                }
+                WARN_ON(num_bytes < root->sectorsize);
+                ret = find_free_extent(trans, root, num_bytes, empty_size,
+                                       search_start, search_end, hint_byte,
+                                       ins, trans->alloc_exclude_start,
+                                       trans->alloc_exclude_nr, data);
+                if (ret != -ENOSPC || num_bytes <= min_alloc_size)
+                        break;
+                /* shrink the request, force a chunk allocation, try again */
+                num_bytes >>= 1;
+                num_bytes &= ~(root->sectorsize - 1);
+                num_bytes = max(num_bytes, min_alloc_size);
+                do_chunk_alloc(trans, info->extent_root,
+                               num_bytes, data, 1);
+        }
+        if (ret) {
+                struct btrfs_space_info *sinfo;
+                sinfo = __find_space_info(info, data);
+                printk(KERN_ERR "btrfs allocation failed flags %llu, "
+                       "wanted %llu\n", (unsigned long long)data,
+                       (unsigned long long)num_bytes);
+                dump_space_info(sinfo, num_bytes);
+                BUG();
+        }
+        return ret;
+}
+/*
+ * Give a reserved range of len bytes at start back to its block group:
+ * the range is discarded, added to the group's free space and passed to
+ * update_reserved_extents() with a last argument of 0.
+ *
+ * Returns -ENOSPC if no block group is found for start, otherwise the
+ * result of btrfs_discard_extent().
+ */
+int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
+{
+        struct btrfs_block_group_cache *group;
+        int discard_ret;
+        group = btrfs_lookup_block_group(root->fs_info, start);
+        if (group == NULL) {
+                printk(KERN_ERR "Unable to find block group for %llu\n",
+                       (unsigned long long)start);
+                return -ENOSPC;
+        }
+        discard_ret = btrfs_discard_extent(root, start, len);
+        btrfs_add_free_space(group, start, len);
+        put_block_group(group);
+        update_reserved_extents(root, start, len, 0);
+        return discard_ret;
+}
+/*
+ * Exported reservation entry point: runs __btrfs_reserve_extent() and then
+ * passes the range recorded in ins (objectid, offset) to
+ * update_reserved_extents() with a last argument of 1.
+ * Returns the result of __btrfs_reserve_extent().
+ */
+int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
+                                  struct btrfs_root *root,
+                                  u64 num_bytes, u64 min_alloc_size,
+                                  u64 empty_size, u64 hint_byte,
+                                  u64 search_end, struct btrfs_key *ins,
+                                  u64 data)
+{
+        int err = __btrfs_reserve_extent(trans, root, num_bytes,
+                                         min_alloc_size, empty_size,
+                                         hint_byte, search_end, ins, data);
+        update_reserved_extents(root, ins->objectid, ins->offset, 1);
+        return err;
+}
+/*
+ * Record the extent described by ins (objectid == start byte, offset ==
+ * length) as allocated.
+ *
+ * The byte counters of the super block copy and of root's root item are
+ * raised first.  For the extent root itself the insertion is only queued
+ * as a PENDING_EXTENT_INSERT op in fs_info->extent_ins; for every other
+ * root an extent item with one reference and a matching back ref item are
+ * inserted into the extent tree, and the queued insert/delete work is
+ * flushed.  Both paths finish by calling update_block_group(), except that
+ * a non-zero del_pending_extents() result is returned without it.
+ *
+ * Returns 0 on success; a failing update_block_group() ends in BUG().
+ */
+static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
+                                         struct btrfs_root *root, u64 parent,
+                                         u64 root_objectid, u64 ref_generation,
+                                         u64 owner, struct btrfs_key *ins)
+{
+        int ret;
+        int pending_ret;
+        u64 super_used;
+        u64 root_used;
+        u64 num_bytes = ins->offset;
+        u32 sizes[2];
+        struct btrfs_fs_info *info = root->fs_info;
+        struct btrfs_root *extent_root = info->extent_root;
+        struct btrfs_extent_item *extent_item;
+        struct btrfs_extent_ref *ref;
+        struct btrfs_path *path;
+        struct btrfs_key keys[2];
+        /* a zero parent defaults to the start byte of the extent itself */
+        if (parent == 0)
+                parent = ins->objectid;
+        /* block accounting for super block */
+        spin_lock(&info->delalloc_lock);
+        super_used = btrfs_super_bytes_used(&info->super_copy);
+        btrfs_set_super_bytes_used(&info->super_copy, super_used + num_bytes);
+        /* block accounting for root item */
+        root_used = btrfs_root_used(&root->root_item);
+        btrfs_set_root_used(&root->root_item, root_used + num_bytes);
+        spin_unlock(&info->delalloc_lock);
+        if (root == extent_root) {
+                /*
+                 * queue the insertion in extent_ins instead of modifying
+                 * the extent tree from here
+                 */
+                struct pending_extent_op *extent_op;
+                extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
+                BUG_ON(!extent_op);
+                extent_op->type = PENDING_EXTENT_INSERT;
+                extent_op->bytenr = ins->objectid;
+                extent_op->num_bytes = ins->offset;
+                extent_op->parent = parent;
+                extent_op->orig_parent = 0;
+                extent_op->generation = ref_generation;
+                extent_op->orig_generation = 0;
+                extent_op->level = (int)owner;
+                INIT_LIST_HEAD(&extent_op->list);
+                extent_op->del = 0;
+                mutex_lock(&root->fs_info->extent_ins_mutex);
+                set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
+                                ins->objectid + ins->offset - 1,
+                                EXTENT_WRITEBACK, GFP_NOFS);
+                set_state_private(&root->fs_info->extent_ins,
+                                  ins->objectid, (unsigned long)extent_op);
+                mutex_unlock(&root->fs_info->extent_ins_mutex);
+                goto update_block;
+        }
+        /*
+         * keys[0] is the extent item key copied from ins, keys[1] the back
+         * ref key for the same start byte; both items are inserted with a
+         * single call
+         */
+        memcpy(&keys[0], ins, sizeof(*ins));
+        keys[1].objectid = ins->objectid;
+        keys[1].type = BTRFS_EXTENT_REF_KEY;
+        keys[1].offset = parent;
+        sizes[0] = sizeof(*extent_item);
+        sizes[1] = sizeof(*ref);
+        path = btrfs_alloc_path();
+        BUG_ON(!path);
+        ret = btrfs_insert_empty_items(trans, extent_root, path, keys,
+                                       sizes, 2);
+        BUG_ON(ret);
+        /* the extent item sits in the returned slot, the back ref in the
+         * slot right after it
+         */
+        extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+                                     struct btrfs_extent_item);
+        btrfs_set_extent_refs(path->nodes[0], extent_item, 1);
+        ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
+                             struct btrfs_extent_ref);
+        btrfs_set_ref_root(path->nodes[0], ref, root_objectid);
+        btrfs_set_ref_generation(path->nodes[0], ref, ref_generation);
+        btrfs_set_ref_objectid(path->nodes[0], ref, owner);
+        btrfs_set_ref_num_refs(path->nodes[0], ref, 1);
+        btrfs_mark_buffer_dirty(path->nodes[0]);
+        /* reset the range that allocations in this transaction must avoid */
+        trans->alloc_exclude_start = 0;
+        trans->alloc_exclude_nr = 0;
+        btrfs_free_path(path);
+        /* flush the insert and delete work queued for the extent root */
+        finish_current_insert(trans, extent_root, 0);
+        pending_ret = del_pending_extents(trans, extent_root, 0);
+        if (ret)
+                goto out;
+        if (pending_ret) {
+                ret = pending_ret;
+                goto out;
+        }
+update_block:
+        ret = update_block_group(trans, root, ins->objectid,
+                                 ins->offset, 1, 0);
+        if (ret) {
+                printk(KERN_ERR "btrfs update block group failed for %llu "
+                       "%llu\n", (unsigned long long)ins->objectid,
+                       (unsigned long long)ins->offset);
+                BUG();
+        }
+out:
+        return ret;
+}
+/*
+ * record an already reserved extent, described by ins, through
+ * __btrfs_alloc_reserved_extent() and then call update_reserved_extents()
+ * on the same range with a zero final argument.
+ *
+ * Nothing is done for extents owned by the tree log root; 0 is returned
+ * in that case.  Otherwise the result of __btrfs_alloc_reserved_extent()
+ * is returned.
+ */
+int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
+                                struct btrfs_root *root, u64 parent,
+                                u64 root_objectid, u64 ref_generation,
+                                u64 owner, struct btrfs_key *ins)
+{
+        int err = 0;
+        if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
+                err = __btrfs_alloc_reserved_extent(trans, root, parent,
+                                                    root_objectid,
+                                                    ref_generation, owner,
+                                                    ins);
+                update_reserved_extents(root, ins->objectid, ins->offset, 0);
+        }
+        return err;
+}
+/*
+ * this is used by the tree logging recovery code.  It records that
+ * an extent has been allocated and makes sure to clear the free
+ * space cache bits as well
+ *
+ * The block group found for ins->objectid is cached under its
+ * cache_mutex, (ins->objectid, ins->offset) is removed from its free
+ * space, and the extent is then recorded with
+ * __btrfs_alloc_reserved_extent(), whose result is returned.
+ */
+int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
+                                struct btrfs_root *root, u64 parent,
+                                u64 root_objectid, u64 ref_generation,
+                                u64 owner, struct btrfs_key *ins)
+{
+        int ret;
+        struct btrfs_block_group_cache *block_group;
+        block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
+        /* NOTE(review): block_group is used without a NULL check - confirm */
+        mutex_lock(&block_group->cache_mutex);
+        cache_block_group(root, block_group);
+        mutex_unlock(&block_group->cache_mutex);
+        ret = btrfs_remove_free_space(block_group, ins->objectid,
+                                      ins->offset);
+        BUG_ON(ret);
+        /* drop the reference obtained from the lookup above */
+        put_block_group(block_group);
+        ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
+                                            ref_generation, owner, ins);
+        return ret;
+}
+/*
+ * reserves a free extent and, unless it belongs to the tree log root,
+ * records it right away through __btrfs_alloc_reserved_extent().  For
+ * the tree log root only update_reserved_extents() is called on the
+ * range, with a final argument of 1.
+ *
+ * the key describing the extent is returned through ins.
+ *
+ * returns 0 if everything worked.  A failure of either step trips
+ * BUG_ON() instead of being returned.
+ */
+int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
+                       struct btrfs_root *root,
+                       u64 num_bytes, u64 parent, u64 min_alloc_size,
+                       u64 root_objectid, u64 ref_generation,
+                       u64 owner_objectid, u64 empty_size, u64 hint_byte,
+                       u64 search_end, struct btrfs_key *ins, u64 data)
+{
+        int ret;
+        ret = __btrfs_reserve_extent(trans, root, num_bytes,
+                                     min_alloc_size, empty_size, hint_byte,
+                                     search_end, ins, data);
+        BUG_ON(ret);
+        if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
+                update_reserved_extents(root, ins->objectid, ins->offset, 1);
+                return ret;
+        }
+        ret = __btrfs_alloc_reserved_extent(trans, root, parent,
+                                            root_objectid, ref_generation,
+                                            owner_objectid, ins);
+        BUG_ON(ret);
+        return ret;
+}
+/*
+ * find or create the tree block at bytenr and prepare it for use in the
+ * running transaction: the header generation is set to trans->transid,
+ * the buffer is locked, cleaned and marked uptodate, and its byte range
+ * is flagged dirty.
+ *
+ * returns the buffer, still holding the tree lock taken here, or
+ * ERR_PTR(-ENOMEM) if the buffer could not be obtained.
+ */
+struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
+                                            struct btrfs_root *root,
+                                            u64 bytenr, u32 blocksize)
+{
+        struct extent_buffer *buf;
+        buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
+        if (!buf)
+                return ERR_PTR(-ENOMEM);
+        btrfs_set_header_generation(buf, trans->transid);
+        btrfs_tree_lock(buf);
+        clean_tree_block(trans, root, buf);
+        btrfs_set_buffer_uptodate(buf);
+        /*
+         * log tree blocks are tracked in the root's dirty_log_pages,
+         * everything else in the transaction's dirty_pages
+         */
+        if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
+                set_extent_dirty(&root->dirty_log_pages, buf->start,
+                         buf->start + buf->len - 1, GFP_NOFS);
+        } else {
+                set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
+                         buf->start + buf->len - 1, GFP_NOFS);
+        }
+        trans->blocks_used++;
+        return buf;
+}
+/*
+ * helper function to allocate a block for a given tree
+ *
+ * an extent of blocksize bytes is allocated with btrfs_alloc_extent()
+ * and a buffer for it is set up with btrfs_init_new_buffer().
+ *
+ * returns the tree buffer, or an ERR_PTR() value on failure (never NULL).
+ */
+struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
+                                             struct btrfs_root *root,
+                                             u32 blocksize, u64 parent,
+                                             u64 root_objectid,
+                                             u64 ref_generation,
+                                             int level,
+                                             u64 hint,
+                                             u64 empty_size)
+{
+        struct btrfs_key ins;
+        int err;
+        err = btrfs_alloc_extent(trans, root, blocksize, parent, blocksize,
+                                 root_objectid, ref_generation, level,
+                                 empty_size, hint, (u64)-1, &ins, 0);
+        if (err) {
+                /* only negative errno values can be carried by ERR_PTR() */
+                BUG_ON(err > 0);
+                return ERR_PTR(err);
+        }
+        return btrfs_init_new_buffer(trans, root, ins.objectid, blocksize);
+}
+/*
+ * drop one reference on every data extent pointed to by the file extent
+ * items in leaf.  Inline extents and items whose disk_bytenr is zero are
+ * skipped.
+ *
+ * always returns 0; a failure from __btrfs_free_extent() trips BUG_ON().
+ */
+int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
+                        struct btrfs_root *root, struct extent_buffer *leaf)
+{
+        u64 leaf_owner;
+        u64 leaf_generation;
+        struct btrfs_key key;
+        struct btrfs_file_extent_item *fi;
+        int i;
+        int nritems;
+        int ret;
+        BUG_ON(!btrfs_is_leaf(leaf));
+        nritems = btrfs_header_nritems(leaf);
+        leaf_owner = btrfs_header_owner(leaf);
+        leaf_generation = btrfs_header_generation(leaf);
+        for (i = 0; i < nritems; i++) {
+                u64 disk_bytenr;
+                cond_resched();
+                btrfs_item_key_to_cpu(leaf, &key, i);
+                /* only file extent items reference data extents */
+                if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
+                        continue;
+                fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
+                if (btrfs_file_extent_type(leaf, fi) ==
+                    BTRFS_FILE_EXTENT_INLINE)
+                        continue;
+                /*
+                 * FIXME make sure to insert a trans record that
+                 * repeats the snapshot del on crash
+                 */
+                disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+                if (disk_bytenr == 0)
+                        continue;
+                ret = __btrfs_free_extent(trans, root, disk_bytenr,
+                                btrfs_file_extent_disk_num_bytes(leaf, fi),
+                                leaf->start, leaf_owner, leaf_generation,
+                                key.objectid, 0);
+                BUG_ON(ret);
+                /* bump throttle_gen and wake waiters on transaction_throttle */
+                atomic_inc(&root->fs_info->throttle_gen);
+                wake_up(&root->fs_info->transaction_throttle);
+                cond_resched();
+        }
+        return 0;
+}
+/*
+ * same job as btrfs_drop_leaf_ref(), but driven by the extent list kept
+ * in a btrfs_leaf_ref instead of by the leaf block itself: every entry
+ * of ref->extents is passed to __btrfs_free_extent().
+ *
+ * always returns 0; a failure from __btrfs_free_extent() trips BUG_ON().
+ */
+static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
+                                        struct btrfs_root *root,
+                                        struct btrfs_leaf_ref *ref)
+{
+        int i;
+        int ret;
+        struct btrfs_extent_info *info = ref->extents;
+        for (i = 0; i < ref->nritems; i++) {
+                ret = __btrfs_free_extent(trans, root, info->bytenr,
+                                          info->num_bytes, ref->bytenr,
+                                          ref->owner, ref->generation,
+                                          info->objectid, 0);
+                atomic_inc(&root->fs_info->throttle_gen);
+                wake_up(&root->fs_info->transaction_throttle);
+                cond_resched();
+                /* ret is only checked after the wakeup and resched above */
+                BUG_ON(ret);
+                info++;
+        }
+        return 0;
+}
+/*
+ * look up the reference count of the extent (start, len) without a
+ * transaction handle and store it in *refs.
+ *
+ * a lookup failure trips BUG_ON(), so the value returned is always 0.
+ */
+static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start,
+                                     u64 len, u32 *refs)
+{
+        int ret;
+        ret = btrfs_lookup_extent_ref(NULL, root, start, len, refs);
+        BUG_ON(ret);
+#if 0 /* some debugging code in case we see problems here */
+        /* if the refs count is one, it won't get increased again.  But
+         * if the ref count is > 1, someone may be decreasing it at
+         * the same time we are.
+         */
+        if (*refs != 1) {
+                struct extent_buffer *eb = NULL;
+                eb = btrfs_find_create_tree_block(root, start, len);
+                if (eb)
+                        btrfs_tree_lock(eb);
+                mutex_lock(&root->fs_info->alloc_mutex);
+                ret = lookup_extent_ref(NULL, root, start, len, refs);
+                BUG_ON(ret);
+                mutex_unlock(&root->fs_info->alloc_mutex);
+                if (eb) {
+                        btrfs_tree_unlock(eb);
+                        free_extent_buffer(eb);
+                }
+                if (*refs == 1) {
+                        printk(KERN_ERR "btrfs block %llu went down to one "
+                               "during drop_snap\n", (unsigned long long)start);
+                }
+        }
+#endif
+        cond_resched();
+        return ret;
+}
+/*
+ * helper function for drop_snapshot, this walks down the tree dropping ref
+ * counts as it goes.
+ *
+ * path and *level describe the current position.  On return the block at
+ * the position that was reached has been handed to __btrfs_free_extent(),
+ * its entry in path->nodes[] has been released and cleared, and *level
+ * has been raised by one.
+ *
+ * always returns 0; failures of the helpers trip BUG_ON().
+ */
+static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
+                                   struct btrfs_root *root,
+                                   struct btrfs_path *path, int *level)
+{
+        u64 root_owner;
+        u64 root_gen;
+        u64 bytenr;
+        u64 ptr_gen;
+        struct extent_buffer *next;
+        struct extent_buffer *cur;
+        struct extent_buffer *parent;
+        struct btrfs_leaf_ref *ref;
+        u32 blocksize;
+        int ret;
+        u32 refs;
+        WARN_ON(*level < 0);
+        WARN_ON(*level >= BTRFS_MAX_LEVEL);
+        ret = drop_snap_lookup_refcount(root, path->nodes[*level]->start,
+                                path->nodes[*level]->len, &refs);
+        BUG_ON(ret);
+        /*
+         * the starting block has more than one reference: skip the
+         * descent and only drop the reference at 'out'
+         */
+        if (refs > 1)
+                goto out;
+        /*
+         * walk down to the last node level and free all the leaves
+         */
+        while (*level >= 0) {
+                WARN_ON(*level < 0);
+                WARN_ON(*level >= BTRFS_MAX_LEVEL);
+                cur = path->nodes[*level];
+                if (btrfs_header_level(cur) != *level)
+                        WARN_ON(1);
+                /* every slot of this node has been visited */
+                if (path->slots[*level] >=
+                    btrfs_header_nritems(cur))
+                        break;
+                if (*level == 0) {
+                        ret = btrfs_drop_leaf_ref(trans, root, cur);
+                        BUG_ON(ret);
+                        break;
+                }
+                bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
+                ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
+                blocksize = btrfs_level_size(root, *level - 1);
+                ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
+                BUG_ON(ret);
+                /*
+                 * the child's ref count is not one: free our reference
+                 * to it and move on to the next slot without descending
+                 */
+                if (refs != 1) {
+                        parent = path->nodes[*level];
+                        root_owner = btrfs_header_owner(parent);
+                        root_gen = btrfs_header_generation(parent);
+                        path->slots[*level]++;
+                        ret = __btrfs_free_extent(trans, root, bytenr,
+                                                blocksize, parent->start,
+                                                root_owner, root_gen,
+                                                *level - 1, 1);
+                        BUG_ON(ret);
+                        atomic_inc(&root->fs_info->throttle_gen);
+                        wake_up(&root->fs_info->transaction_throttle);
+                        cond_resched();
+                        continue;
+                }
+                /*
+                 * at this point, we have a single ref, and since the
+                 * only place referencing this extent is a dead root
+                 * the reference count should never go higher.
+                 * So, we don't need to check it again
+                 */
+                if (*level == 1) {
+                        /*
+                         * a leaf ref whose generation matches the pointer
+                         * is used to drop the leaf's extents here, without
+                         * going through read_tree_block() below
+                         */
+                        ref = btrfs_lookup_leaf_ref(root, bytenr);
+                        if (ref && ref->generation != ptr_gen) {
+                                btrfs_free_leaf_ref(root, ref);
+                                ref = NULL;
+                        }
+                        if (ref) {
+                                ret = cache_drop_leaf_ref(trans, root, ref);
+                                BUG_ON(ret);
+                                btrfs_remove_leaf_ref(root, ref);
+                                btrfs_free_leaf_ref(root, ref);
+                                *level = 0;
+                                break;
+                        }
+                }
+                next = btrfs_find_tree_block(root, bytenr, blocksize);
+                if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
+                        free_extent_buffer(next);
+                        /*
+                         * NOTE(review): the result of read_tree_block() is
+                         * used below without a NULL check - confirm
+                         */
+                        next = read_tree_block(root, bytenr, blocksize,
+                                               ptr_gen);
+                        cond_resched();
+#if 0
+                        /*
+                         * this is a debugging check and can go away
+                         * the ref should never go all the way down to 1
+                         * at this point
+                         */
+                        ret = lookup_extent_ref(NULL, root, bytenr, blocksize,
+                                                &refs);
+                        BUG_ON(ret);
+                        WARN_ON(refs != 1);
+#endif
+                }
+                WARN_ON(*level <= 0);
+                /* descend: install the child in the path one level down */
+                if (path->nodes[*level-1])
+                        free_extent_buffer(path->nodes[*level-1]);
+                path->nodes[*level-1] = next;
+                *level = btrfs_header_level(next);
+                path->slots[*level] = 0;
+                cond_resched();
+        }
+out:
+        WARN_ON(*level < 0);
+        WARN_ON(*level >= BTRFS_MAX_LEVEL);
+        /*
+         * the root node is used as its own parent; any other block takes
+         * its bytenr from the pointer stored in the node one level up
+         */
+        if (path->nodes[*level] == root->node) {
+                parent = path->nodes[*level];
+                bytenr = path->nodes[*level]->start;
+        } else {
+                parent = path->nodes[*level + 1];
+                bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]);
+        }
+        blocksize = btrfs_level_size(root, *level);
+        root_owner = btrfs_header_owner(parent);
+        root_gen = btrfs_header_generation(parent);
+        ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
+                                  parent->start, root_owner, root_gen,
+                                  *level, 1);
+        free_extent_buffer(path->nodes[*level]);
+        path->nodes[*level] = NULL;
+        *level += 1;
+        BUG_ON(ret);
+        cond_resched();
+        return 0;
+}
+/*
+ * helper function for drop_subtree, this function is similar to
+ * walk_down_tree. The main difference is that it checks reference
+ * counts while tree blocks are locked.
+ *
+ * On return the block at the position that was reached has been passed
+ * to btrfs_free_extent(), unlocked if path->locks[] said it was locked,
+ * released from path->nodes[], and *level has been raised by one.
+ *
+ * always returns 0; failures of the helpers trip BUG_ON().
+ */
+static noinline int walk_down_subtree(struct btrfs_trans_handle *trans,
+                                      struct btrfs_root *root,
+                                      struct btrfs_path *path, int *level)
+{
+        struct extent_buffer *next;
+        struct extent_buffer *cur;
+        struct extent_buffer *parent;
+        u64 bytenr;
+        u64 ptr_gen;
+        u32 blocksize;
+        u32 refs;
+        int ret;
+        cur = path->nodes[*level];
+        ret = btrfs_lookup_extent_ref(trans, root, cur->start, cur->len,
+                                      &refs);
+        BUG_ON(ret);
+        /* shared starting block: only drop our reference at 'out' */
+        if (refs > 1)
+                goto out;
+        while (*level >= 0) {
+                cur = path->nodes[*level];
+                if (*level == 0) {
+                        ret = btrfs_drop_leaf_ref(trans, root, cur);
+                        BUG_ON(ret);
+                        clean_tree_block(trans, root, cur);
+                        break;
+                }
+                /* every slot of this node has been visited */
+                if (path->slots[*level] >= btrfs_header_nritems(cur)) {
+                        clean_tree_block(trans, root, cur);
+                        break;
+                }
+                bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
+                blocksize = btrfs_level_size(root, *level - 1);
+                ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
+                /*
+                 * NOTE(review): the result of read_tree_block() is locked
+                 * and used without a NULL check - confirm
+                 */
+                next = read_tree_block(root, bytenr, blocksize, ptr_gen);
+                btrfs_tree_lock(next);
+                /* the ref count is read with the child locked */
+                ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize,
+                                              &refs);
+                BUG_ON(ret);
+                /*
+                 * shared child: drop our reference, release it and move on
+                 * to the next slot without descending
+                 */
+                if (refs > 1) {
+                        parent = path->nodes[*level];
+                        ret = btrfs_free_extent(trans, root, bytenr,
+                                        blocksize, parent->start,
+                                        btrfs_header_owner(parent),
+                                        btrfs_header_generation(parent),
+                                        *level - 1, 1);
+                        BUG_ON(ret);
+                        path->slots[*level]++;
+                        btrfs_tree_unlock(next);
+                        free_extent_buffer(next);
+                        continue;
+                }
+                /* descend; the child stays locked and is recorded as such */
+                *level = btrfs_header_level(next);
+                path->nodes[*level] = next;
+                path->slots[*level] = 0;
+                path->locks[*level] = 1;
+                cond_resched();
+        }
+out:
+        /* NOTE(review): path->nodes[*level + 1] is assumed to be set */
+        parent = path->nodes[*level + 1];
+        bytenr = path->nodes[*level]->start;
+        blocksize = path->nodes[*level]->len;
+        ret = btrfs_free_extent(trans, root, bytenr, blocksize,
+                        parent->start, btrfs_header_owner(parent),
+                        btrfs_header_generation(parent), *level, 1);
+        BUG_ON(ret);
+        if (path->locks[*level]) {
+                btrfs_tree_unlock(path->nodes[*level]);
+                path->locks[*level] = 0;
+        }
+        free_extent_buffer(path->nodes[*level]);
+        path->nodes[*level] = NULL;
+        *level += 1;
+        cond_resched();
+        return 0;
+}
+/*
+ * helper for dropping snapshots.  This walks back up the tree in the path
+ * to find the first node higher up where we haven't yet gone through
+ * all the slots
+ *
+ * levels from *level up to (but not including) max_level are examined.
+ * Nodes that have no slots left are freed on the way up.
+ *
+ * returns 0 when such a node was found; its slot has been advanced,
+ * *level set to its level, and the key at the new slot saved in the root
+ * item's drop_progress/drop_level.  returns 1 when the walk ran out of
+ * levels.
+ */
+static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
+                                 struct btrfs_root *root,
+                                 struct btrfs_path *path,
+                                 int *level, int max_level)
+{
+        u64 root_owner;
+        u64 root_gen;
+        struct btrfs_root_item *root_item = &root->root_item;
+        int i;
+        int slot;
+        int ret;
+        for (i = *level; i < max_level && path->nodes[i]; i++) {
+                slot = path->slots[i];
+                if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
+                        struct extent_buffer *node;
+                        struct btrfs_disk_key disk_key;
+                        node = path->nodes[i];
+                        path->slots[i]++;
+                        *level = i;
+                        WARN_ON(*level == 0);
+                        /* remember how far the drop has progressed */
+                        btrfs_node_key(node, &disk_key, path->slots[i]);
+                        memcpy(&root_item->drop_progress,
+                               &disk_key, sizeof(disk_key));
+                        root_item->drop_level = i;
+                        return 0;
+                } else {
+                        struct extent_buffer *parent;
+                        /* the root node is used as its own parent */
+                        if (path->nodes[*level] == root->node)
+                                parent = path->nodes[*level];
+                        else
+                                parent = path->nodes[*level + 1];
+                        root_owner = btrfs_header_owner(parent);
+                        root_gen = btrfs_header_generation(parent);
+                        clean_tree_block(trans, root, path->nodes[*level]);
+                        ret = btrfs_free_extent(trans, root,
+                                                path->nodes[*level]->start,
+                                                path->nodes[*level]->len,
+                                                parent->start, root_owner,
+                                                root_gen, *level, 1);
+                        BUG_ON(ret);
+                        if (path->locks[*level]) {
+                                btrfs_tree_unlock(path->nodes[*level]);
+                                path->locks[*level] = 0;
+                        }
+                        free_extent_buffer(path->nodes[*level]);
+                        path->nodes[*level] = NULL;
+                        /* keeps *level equal to i on the next iteration */
+                        *level = i + 1;
+                }
+        }
+        return 1;
+}
+/*
+ * drop the reference count on the tree rooted at 'snap'.  This traverses
+ * the tree freeing any blocks that have a ref count of zero after being
+ * decremented.
+ *
+ * the tree to drop is passed in as 'root'.  A warning is issued unless
+ * fs_info->drop_mutex is held.  If the root item records drop_progress,
+ * the walk resumes from that key at drop_level; otherwise it starts at
+ * the root node.
+ *
+ * returns 0 when the walk completed, -EAGAIN when it stopped because the
+ * transaction is committing, or a negative error from btrfs_search_slot().
+ */
+int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
+                        *root)
+{
+        int ret = 0;
+        int wret;
+        int level;
+        struct btrfs_path *path;
+        int i;
+        int orig_level;
+        struct btrfs_root_item *root_item = &root->root_item;
+        WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex));
+        path = btrfs_alloc_path();
+        BUG_ON(!path);
+        level = btrfs_header_level(root->node);
+        orig_level = level;
+        if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
+                /* no saved progress: start at the root node */
+                path->nodes[level] = root->node;
+                extent_buffer_get(root->node);
+                path->slots[level] = 0;
+        } else {
+                struct btrfs_key key;
+                struct btrfs_disk_key found_key;
+                struct extent_buffer *node;
+                /* resume at the key and level saved by walk_up_tree() */
+                btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
+                level = root_item->drop_level;
+                path->lowest_level = level;
+                wret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+                if (wret < 0) {
+                        ret = wret;
+                        goto out;
+                }
+                node = path->nodes[level];
+                btrfs_node_key(node, &found_key, path->slots[level]);
+                WARN_ON(memcmp(&found_key, &root_item->drop_progress,
+                               sizeof(found_key)));
+                /*
+                 * unlock our path, this is safe because only this
+                 * function is allowed to delete this snapshot
+                 */
+                for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+                        if (path->nodes[i] && path->locks[i]) {
+                                path->locks[i] = 0;
+                                btrfs_tree_unlock(path->nodes[i]);
+                        }
+                }
+        }
+        /* alternate walking down and up until walk_up_tree() is done */
+        while (1) {
+                wret = walk_down_tree(trans, root, path, &level);
+                if (wret > 0)
+                        break;
+                if (wret < 0)
+                        ret = wret;
+                wret = walk_up_tree(trans, root, path, &level,
+                                    BTRFS_MAX_LEVEL);
+                if (wret > 0)
+                        break;
+                if (wret < 0)
+                        ret = wret;
+                /* stop early so the caller can retry after the commit */
+                if (trans->transaction->in_commit) {
+                        ret = -EAGAIN;
+                        break;
+                }
+                atomic_inc(&root->fs_info->throttle_gen);
+                wake_up(&root->fs_info->transaction_throttle);
+        }
+        /* release whatever buffers are still referenced by the path */
+        for (i = 0; i <= orig_level; i++) {
+                if (path->nodes[i]) {
+                        free_extent_buffer(path->nodes[i]);
+                        path->nodes[i] = NULL;
+                }
+        }
+out:
+        btrfs_free_path(path);
+        return ret;
+}
+/*
+ * drop the subtree rooted at 'node', whose parent block is 'parent'.
+ * Both buffers must be tree-locked by the caller (checked with BUG_ON).
+ *
+ * the walk alternates walk_down_subtree() and walk_up_tree(), with the
+ * upward walk bounded by the parent's level.
+ *
+ * returns 0, or a negative value if one of the walkers reported one.
+ */
+int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
+                        struct btrfs_root *root,
+                        struct extent_buffer *node,
+                        struct extent_buffer *parent)
+{
+        struct btrfs_path *path;
+        int level;
+        int parent_level;
+        int ret = 0;
+        int wret;
+        path = btrfs_alloc_path();
+        BUG_ON(!path);
+        BUG_ON(!btrfs_tree_locked(parent));
+        parent_level = btrfs_header_level(parent);
+        extent_buffer_get(parent);
+        path->nodes[parent_level] = parent;
+        /* the parent's slot is placed past its last item */
+        path->slots[parent_level] = btrfs_header_nritems(parent);
+        BUG_ON(!btrfs_tree_locked(node));
+        level = btrfs_header_level(node);
+        extent_buffer_get(node);
+        path->nodes[level] = node;
+        path->slots[level] = 0;
+        while (1) {
+                wret = walk_down_subtree(trans, root, path, &level);
+                if (wret < 0)
+                        ret = wret;
+                if (wret != 0)
+                        break;
+                wret = walk_up_tree(trans, root, path, &level, parent_level);
+                if (wret < 0)
+                        ret = wret;
+                if (wret != 0)
+                        break;
+        }
+        btrfs_free_path(path);
+        return ret;
+}
+/*
+ * returns the smaller of 'last' and 'start + nr - 1'
+ */
+static unsigned long calc_ra(unsigned long start, unsigned long last,
+                             unsigned long nr)
+{
+        unsigned long ra_end = start + nr - 1;
+        return min(last, ra_end);
+}
+/*
+ * bring every page cache page of 'inode' that covers the byte range
+ * starting at 'start' and 'len' bytes long uptodate, and mark each one
+ * delalloc and dirty.
+ *
+ * the pages in the range are invalidated first.  For each page the
+ * function then waits for writeback and for any ordered extent found at
+ * the page's start before dirtying it.  inode->i_mutex is held for the
+ * whole walk.
+ *
+ * returns 0 on success, -ENOMEM if the readahead state or a page could
+ * not be allocated, -EIO if a page could not be read, or the error from
+ * invalidate_inode_pages2_range().
+ */
+static noinline int relocate_inode_pages(struct inode *inode, u64 start,
+                                         u64 len)
+{
+        u64 page_start;
+        u64 page_end;
+        unsigned long first_index;
+        unsigned long last_index;
+        unsigned long i;
+        struct page *page;
+        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+        struct file_ra_state *ra;
+        struct btrfs_ordered_extent *ordered;
+        unsigned int total_read = 0;
+        unsigned int total_dirty = 0;
+        int ret = 0;
+        ra = kzalloc(sizeof(*ra), GFP_NOFS);
+        /* nothing is locked or dirtied yet, so just report the failure */
+        if (!ra)
+                return -ENOMEM;
+        mutex_lock(&inode->i_mutex);
+        first_index = start >> PAGE_CACHE_SHIFT;
+        last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
+        /* make sure the dirty trick played by the caller work */
+        ret = invalidate_inode_pages2_range(inode->i_mapping,
+                                            first_index, last_index);
+        if (ret)
+                goto out_unlock;
+        file_ra_state_init(ra, inode->i_mapping);
+        for (i = first_index ; i <= last_index; i++) {
+                /* start a new readahead window every ra_pages pages */
+                if (total_read % ra->ra_pages == 0) {
+                        btrfs_force_ra(inode->i_mapping, ra, NULL, i,
+                                       calc_ra(i, last_index, ra->ra_pages));
+                }
+                total_read++;
+again:
+                if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
+                        BUG_ON(1);
+                page = grab_cache_page(inode->i_mapping, i);
+                if (!page) {
+                        ret = -ENOMEM;
+                        goto out_unlock;
+                }
+                if (!PageUptodate(page)) {
+                        btrfs_readpage(NULL, page);
+                        lock_page(page);
+                        if (!PageUptodate(page)) {
+                                unlock_page(page);
+                                page_cache_release(page);
+                                ret = -EIO;
+                                goto out_unlock;
+                        }
+                }
+                wait_on_page_writeback(page);
+                page_start = (u64)page->index << PAGE_CACHE_SHIFT;
+                page_end = page_start + PAGE_CACHE_SIZE - 1;
+                lock_extent(io_tree, page_start, page_end, GFP_NOFS);
+                /*
+                 * an ordered extent is still pending for this page: drop
+                 * everything, wait for it, then retry the same index
+                 */
+                ordered = btrfs_lookup_ordered_extent(inode, page_start);
+                if (ordered) {
+                        unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+                        unlock_page(page);
+                        page_cache_release(page);
+                        btrfs_start_ordered_extent(inode, ordered, 1);
+                        btrfs_put_ordered_extent(ordered);
+                        goto again;
+                }
+                set_page_extent_mapped(page);
+                /* only the first page of the range gets EXTENT_BOUNDARY */
+                if (i == first_index)
+                        set_extent_bits(io_tree, page_start, page_end,
+                                        EXTENT_BOUNDARY, GFP_NOFS);
+                btrfs_set_extent_delalloc(inode, page_start, page_end);
+                set_page_dirty(page);
+                total_dirty++;
+                unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+                unlock_page(page);
+                page_cache_release(page);
+        }
+out_unlock:
+        kfree(ra);
+        mutex_unlock(&inode->i_mutex);
+        balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
+        return ret;
+}
+/*
+ * relocate the data extent described by extent_key through reloc_inode.
+ *
+ * a pinned extent map is installed in reloc_inode so that file offset
+ * (extent_key->objectid - offset) maps to the extent's disk location,
+ * then relocate_inode_pages() is run over that file range and its result
+ * is returned.
+ */
+static noinline int relocate_data_extent(struct inode *reloc_inode,
+                                         struct btrfs_key *extent_key,
+                                         u64 offset)
+{
+        struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
+        struct extent_map_tree *em_tree = &BTRFS_I(reloc_inode)->extent_tree;
+        struct extent_map *em;
+        u64 start = extent_key->objectid - offset;
+        u64 end = start + extent_key->offset - 1;
+        em = alloc_extent_map(GFP_NOFS);
+        BUG_ON(!em || IS_ERR(em));
+        em->start = start;
+        em->len = extent_key->offset;
+        em->block_len = extent_key->offset;
+        em->block_start = extent_key->objectid;
+        em->bdev = root->fs_info->fs_devices->latest_bdev;
+        set_bit(EXTENT_FLAG_PINNED, &em->flags);
+        /* setup extent map to cheat btrfs_readpage */
+        lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
+        while (1) {
+                int ret;
+                spin_lock(&em_tree->lock);
+                ret = add_extent_mapping(em_tree, em);
+                spin_unlock(&em_tree->lock);
+                /* any result other than -EEXIST ends the loop */
+                if (ret != -EEXIST) {
+                        free_extent_map(em);
+                        break;
+                }
+                /* a mapping already covers the range: drop it and retry */
+                btrfs_drop_extent_cache(reloc_inode, start, end, 0);
+        }
+        unlock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
+        return relocate_inode_pages(reloc_inode, start, extent_key->offset);
+}
+/*
+ * state used by __next_ref_path(), which resets lowest_level,
+ * current_level and shared_level to -1 on its first call and reads
+ * nodes[level] as a block's bytenr.
+ *
+ * NOTE(review): the remaining field meanings are inferred from their
+ * names only - confirm against the users of this struct.
+ */
+struct btrfs_ref_path {
+        u64 extent_start;
+        u64 nodes[BTRFS_MAX_LEVEL];     /* bytenr recorded per level */
+        u64 root_objectid;
+        u64 root_generation;
+        u64 owner_objectid;
+        u32 num_refs;
+        int lowest_level;
+        int current_level;
+        int shared_level;
+        struct btrfs_key node_keys[BTRFS_MAX_LEVEL];
+        u64 new_nodes[BTRFS_MAX_LEVEL];
+};
+/*
+ * In-memory copy of the fields of one btrfs_file_extent_item, as collected
+ * by get_new_locations().  Each member mirrors the file extent item field
+ * of the same name.
+ */
+struct disk_extent {
+        u64 ram_bytes;
+        /* start and length of the extent on disk */
+        u64 disk_bytenr;
+        u64 disk_num_bytes;
+        /* offset into the extent and number of bytes referenced */
+        u64 offset;
+        u64 num_bytes;
+        /* get_new_locations() insists that both of these are zero */
+        u8 compression;
+        u8 encryption;
+        u16 other_encoding;
+};
+/*
+ * Return 1 when root_objectid names the root tree, extent tree, chunk
+ * tree, device tree, tree log or checksum tree, and 0 for any other
+ * objectid.
+ */
+static int is_cowonly_root(u64 root_objectid)
+{
+        return root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
+               root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
+               root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
+               root_objectid == BTRFS_DEV_TREE_OBJECTID ||
+               root_objectid == BTRFS_TREE_LOG_OBJECTID ||
+               root_objectid == BTRFS_CSUM_TREE_OBJECTID;
+}
+/*
+ * Move ref_path to the next chain of back references that leads upwards
+ * from the extent at ref_path->extent_start.
+ *
+ * With first_time set, the level bookkeeping is reset and the walk climbs
+ * straight up from the extent.  Otherwise the walk first descends from
+ * current_level (walk_down), looking at each level for another
+ * BTRFS_EXTENT_REF_KEY item whose key offset is greater than the parent
+ * recorded for that level, and resumes climbing from the item it finds.
+ *
+ * Returns 0 when the path was completed: it ended at a reference whose key
+ * objectid equals its key offset, at a root accepted by is_cowonly_root(),
+ * or at a reference created in the running transaction.  Returns a
+ * positive value when no further path exists (or the extent's references
+ * disappeared), and a negative errno on allocation or tree search failure.
+ */
+static noinline int __next_ref_path(struct btrfs_trans_handle *trans,
+                                    struct btrfs_root *extent_root,
+                                    struct btrfs_ref_path *ref_path,
+                                    int first_time)
+{
+        struct extent_buffer *leaf;
+        struct btrfs_path *path;
+        struct btrfs_extent_ref *ref;
+        struct btrfs_key key;
+        struct btrfs_key found_key;
+        u64 bytenr;
+        u32 nritems;
+        int level;
+        int ret = 1;
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+        if (first_time) {
+                ref_path->lowest_level = -1;
+                ref_path->current_level = -1;
+                ref_path->shared_level = -1;
+                goto walk_up;
+        }
+walk_down:
+        /*
+         * Descend one level at a time, down to lowest_level, searching for
+         * a reference that comes after the parent already visited.
+         */
+        level = ref_path->current_level - 1;
+        while (level >= -1) {
+                u64 parent;
+                if (level < ref_path->lowest_level)
+                        break;
+                /* level -1 stands for the starting extent itself */
+                if (level >= 0)
+                        bytenr = ref_path->nodes[level];
+                else
+                        bytenr = ref_path->extent_start;
+                BUG_ON(bytenr == 0);
+                /* forget the parent that was already visited */
+                parent = ref_path->nodes[level + 1];
+                ref_path->nodes[level + 1] = 0;
+                ref_path->current_level = level;
+                BUG_ON(parent == 0);
+                /* position just past the reference held by that parent */
+                key.objectid = bytenr;
+                key.offset = parent + 1;
+                key.type = BTRFS_EXTENT_REF_KEY;
+                ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
+                if (ret < 0)
+                        goto out;
+                BUG_ON(ret == 0);
+                leaf = path->nodes[0];
+                nritems = btrfs_header_nritems(leaf);
+                if (path->slots[0] >= nritems) {
+                        ret = btrfs_next_leaf(extent_root, path);
+                        if (ret < 0)
+                                goto out;
+                        if (ret > 0)
+                                goto next;
+                        leaf = path->nodes[0];
+                }
+                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+                if (found_key.objectid == bytenr &&
+                    found_key.type == BTRFS_EXTENT_REF_KEY) {
+                        if (level < ref_path->shared_level)
+                                ref_path->shared_level = level;
+                        /* continue inside the walk_up loop body */
+                        goto found;
+                }
+next:
+                level--;
+                btrfs_release_path(extent_root, path);
+                cond_resched();
+        }
+        /* reached lowest level */
+        ret = 1;
+        goto out;
+walk_up:
+        /*
+         * Climb from current_level, following the first reference item of
+         * each block to the block that references it.
+         */
+        level = ref_path->current_level;
+        while (level < BTRFS_MAX_LEVEL - 1) {
+                u64 ref_objectid;
+                if (level >= 0)
+                        bytenr = ref_path->nodes[level];
+                else
+                        bytenr = ref_path->extent_start;
+                BUG_ON(bytenr == 0);
+                key.objectid = bytenr;
+                key.offset = 0;
+                key.type = BTRFS_EXTENT_REF_KEY;
+                ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
+                if (ret < 0)
+                        goto out;
+                leaf = path->nodes[0];
+                nritems = btrfs_header_nritems(leaf);
+                if (path->slots[0] >= nritems) {
+                        ret = btrfs_next_leaf(extent_root, path);
+                        if (ret < 0)
+                                goto out;
+                        if (ret > 0) {
+                                /* the extent was freed by someone */
+                                if (ref_path->lowest_level == level)
+                                        goto out;
+                                btrfs_release_path(extent_root, path);
+                                goto walk_down;
+                        }
+                        leaf = path->nodes[0];
+                }
+                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+                if (found_key.objectid != bytenr ||
+                                found_key.type != BTRFS_EXTENT_REF_KEY) {
+                        /* the extent was freed by someone */
+                        if (ref_path->lowest_level == level) {
+                                ret = 1;
+                                goto out;
+                        }
+                        btrfs_release_path(extent_root, path);
+                        goto walk_down;
+                }
+found:
+                /*
+                 * Entered from both loops: leaf and path->slots[0] point at
+                 * a reference item of bytenr, whose key is in found_key.
+                 */
+                ref = btrfs_item_ptr(leaf, path->slots[0],
+                                struct btrfs_extent_ref);
+                ref_objectid = btrfs_ref_objectid(leaf, ref);
+                if (ref_objectid < BTRFS_FIRST_FREE_OBJECTID) {
+                        /* the objectid is used as the level of the block */
+                        if (first_time) {
+                                level = (int)ref_objectid;
+                                BUG_ON(level >= BTRFS_MAX_LEVEL);
+                                ref_path->lowest_level = level;
+                                ref_path->current_level = level;
+                                ref_path->nodes[level] = bytenr;
+                        } else {
+                                WARN_ON(ref_objectid != level);
+                        }
+                } else {
+                        WARN_ON(level != -1);
+                }
+                first_time = 0;
+                /* remember who owns the block the walk started from */
+                if (ref_path->lowest_level == level) {
+                        ref_path->owner_objectid = ref_objectid;
+                        ref_path->num_refs = btrfs_ref_num_refs(leaf, ref);
+                }
+                /*
+                 * the block is tree root or the block isn't in reference
+                 * counted tree.
+                 */
+                if (found_key.objectid == found_key.offset ||
+                    is_cowonly_root(btrfs_ref_root(leaf, ref))) {
+                        ref_path->root_objectid = btrfs_ref_root(leaf, ref);
+                        ref_path->root_generation =
+                                btrfs_ref_generation(leaf, ref);
+                        if (level < 0) {
+                                /* special reference from the tree log */
+                                ref_path->nodes[0] = found_key.offset;
+                                ref_path->current_level = 0;
+                        }
+                        ret = 0;
+                        goto out;
+                }
+                /* the key offset names the parent block: record it */
+                level++;
+                BUG_ON(ref_path->nodes[level] != 0);
+                ref_path->nodes[level] = found_key.offset;
+                ref_path->current_level = level;
+                /*
+                 * the reference was created in the running transaction,
+                 * no need to continue walking up.
+                 */
+                if (btrfs_ref_generation(leaf, ref) == trans->transid) {
+                        ref_path->root_objectid = btrfs_ref_root(leaf, ref);
+                        ref_path->root_generation =
+                                btrfs_ref_generation(leaf, ref);
+                        ret = 0;
+                        goto out;
+                }
+                btrfs_release_path(extent_root, path);
+                cond_resched();
+        }
+        /* reached max tree level, but no tree root found. */
+        BUG();
+out:
+        btrfs_free_path(path);
+        return ret;
+}
+/*
+ * Start a reference path walk for the extent at extent_start: zero
+ * *ref_path, record the extent and look up its first path.  The return
+ * value is that of __next_ref_path().
+ */
+static int btrfs_first_ref_path(struct btrfs_trans_handle *trans,
+                                struct btrfs_root *extent_root,
+                                struct btrfs_ref_path *ref_path,
+                                u64 extent_start)
+{
+        memset(ref_path, 0, sizeof(*ref_path));
+        ref_path->extent_start = extent_start;
+        return __next_ref_path(trans, extent_root, ref_path, 1);
+}
+/*
+ * Advance a walk that was started with btrfs_first_ref_path() to its next
+ * path.  The return value is that of __next_ref_path().
+ */
+static int btrfs_next_ref_path(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *extent_root,
+                               struct btrfs_ref_path *ref_path)
+{
+        return __next_ref_path(trans, extent_root, ref_path, 0);
+}
+/*
+ * Collect the file extents of reloc_inode that back the disk extent
+ * described by extent_key.
+ *
+ * The file range starts at extent_key->objectid - offset.  Every regular
+ * file extent item found there is copied into an array of disk_extent.
+ *
+ * @extents:    in: caller supplied array or NULL; out (on success only):
+ *              the array holding the result.  An array allocated here is
+ *              handed to the caller, who must kfree() it.
+ * @nr_extents: in: capacity of *extents; out (on success only): number of
+ *              entries filled in.
+ * @no_fragment: give up with 1 as soon as more than one file extent would
+ *              be needed to cover the disk extent.
+ *
+ * Returns 0 on success, 1 in the no_fragment case above, -ENOENT when the
+ * range is not (fully) covered by file extents, -ENOMEM on allocation
+ * failure, or another negative errno from the tree search.  On any
+ * non-zero return the outputs are untouched and memory allocated here is
+ * released.
+ */
+static noinline int get_new_locations(struct inode *reloc_inode,
+                                      struct btrfs_key *extent_key,
+                                      u64 offset, int no_fragment,
+                                      struct disk_extent **extents,
+                                      int *nr_extents)
+{
+        struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
+        struct btrfs_path *path;
+        struct btrfs_file_extent_item *fi;
+        struct extent_buffer *leaf;
+        struct disk_extent *exts = *extents;
+        struct btrfs_key found_key;
+        u64 cur_pos;
+        u64 last_byte;
+        u32 nritems;
+        int nr = 0;
+        int max = *nr_extents;
+        int ret;
+        WARN_ON(!no_fragment && *extents);
+        if (!exts) {
+                max = 1;
+                exts = kmalloc(sizeof(*exts) * max, GFP_NOFS);
+                if (!exts)
+                        return -ENOMEM;
+        }
+        path = btrfs_alloc_path();
+        if (!path) {
+                /* only release the array if it was allocated above */
+                if (exts != *extents)
+                        kfree(exts);
+                return -ENOMEM;
+        }
+        cur_pos = extent_key->objectid - offset;
+        last_byte = extent_key->objectid + extent_key->offset;
+        ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
+                                       cur_pos, 0);
+        if (ret < 0)
+                goto out;
+        if (ret > 0) {
+                ret = -ENOENT;
+                goto out;
+        }
+        while (1) {
+                leaf = path->nodes[0];
+                nritems = btrfs_header_nritems(leaf);
+                if (path->slots[0] >= nritems) {
+                        ret = btrfs_next_leaf(root, path);
+                        if (ret < 0)
+                                goto out;
+                        if (ret > 0)
+                                break;
+                        leaf = path->nodes[0];
+                }
+                /* stop at the first item that does not continue the range */
+                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+                if (found_key.offset != cur_pos ||
+                    found_key.type != BTRFS_EXTENT_DATA_KEY ||
+                    found_key.objectid != reloc_inode->i_ino)
+                        break;
+                fi = btrfs_item_ptr(leaf, path->slots[0],
+                                    struct btrfs_file_extent_item);
+                if (btrfs_file_extent_type(leaf, fi) !=
+                    BTRFS_FILE_EXTENT_REG ||
+                    btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
+                        break;
+                if (nr == max) {
+                        /*
+                         * Double the array.  Allocate into a temporary so
+                         * that the old array stays owned by 'exts' (and is
+                         * released at 'out') if the allocation fails.
+                         */
+                        struct disk_extent *bigger;
+                        bigger = kzalloc(sizeof(*exts) * max * 2, GFP_NOFS);
+                        if (!bigger) {
+                                ret = -ENOMEM;
+                                goto out;
+                        }
+                        memcpy(bigger, exts, sizeof(*exts) * nr);
+                        /* the caller's array is never freed here */
+                        if (exts != *extents)
+                                kfree(exts);
+                        exts = bigger;
+                        max *= 2;
+                }
+                exts[nr].disk_bytenr =
+                        btrfs_file_extent_disk_bytenr(leaf, fi);
+                exts[nr].disk_num_bytes =
+                        btrfs_file_extent_disk_num_bytes(leaf, fi);
+                exts[nr].offset = btrfs_file_extent_offset(leaf, fi);
+                exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
+                exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
+                exts[nr].compression = btrfs_file_extent_compression(leaf, fi);
+                exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi);
+                exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf,
+                                                                           fi);
+                /* only whole, unencoded extents are expected here */
+                BUG_ON(exts[nr].offset > 0);
+                BUG_ON(exts[nr].compression || exts[nr].encryption);
+                BUG_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
+                cur_pos += exts[nr].num_bytes;
+                nr++;
+                if (cur_pos + offset >= last_byte)
+                        break;
+                /* more than one file extent would be needed */
+                if (no_fragment) {
+                        ret = 1;
+                        goto out;
+                }
+                path->slots[0]++;
+        }
+        BUG_ON(cur_pos + offset > last_byte);
+        if (cur_pos + offset < last_byte) {
+                ret = -ENOENT;
+                goto out;
+        }
+        ret = 0;
+out:
+        btrfs_free_path(path);
+        if (ret) {
+                if (exts != *extents)
+                        kfree(exts);
+        } else {
+                *extents = exts;
+                *nr_extents = nr;
+        }
+        return ret;
+}
+/*
+ * Redirect file extent items that point at the disk extent described by
+ * extent_key to its new location.
+ *
+ * Starting at leaf_key (or at the first extent data key of
+ * ref_path->owner_objectid, whichever is greater), file extent items of
+ * 'root' are scanned for regular or preallocated extents whose disk bytenr
+ * equals extent_key->objectid.  For each one, the owning inode is looked
+ * up and its i_mutex taken with trylock, the file range is locked in the
+ * inode's io_tree, and then the item is updated in place to point at
+ * new_extents[0].  A reference is added for the new extent and the one on
+ * the old extent is dropped.
+ *
+ * Inodes that cannot be obtained, are bad, or whose i_mutex is contended
+ * are skipped.  Only nr_extents == 1 is supported; anything else hits
+ * BUG_ON().
+ *
+ * Returns 0 when the scan finished, or a negative errno from
+ * btrfs_search_slot() or btrfs_next_leaf().
+ */
+static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
+                                        struct btrfs_root *root,
+                                        struct btrfs_path *path,
+                                        struct btrfs_key *extent_key,
+                                        struct btrfs_key *leaf_key,
+                                        struct btrfs_ref_path *ref_path,
+                                        struct disk_extent *new_extents,
+                                        int nr_extents)
+{
+        struct extent_buffer *leaf;
+        struct btrfs_file_extent_item *fi;
+        struct inode *inode = NULL;
+        struct btrfs_key key;
+        u64 lock_start = 0;
+        u64 lock_end = 0;
+        u64 num_bytes;
+        u64 ext_offset;
+        u64 first_pos;
+        u32 nritems;
+        int nr_scaned = 0;
+        int extent_locked = 0;
+        int extent_type;
+        int ret;
+        memcpy(&key, leaf_key, sizeof(key));
+        /*
+         * first_pos tracks the lowest file position at which the extent
+         * was seen; start with the largest value for which
+         * first_pos + extent length stays within INT_LIMIT(loff_t).
+         */
+        first_pos = INT_LIMIT(loff_t) - extent_key->offset;
+        if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
+                /* single owner: do not start before its extent data items */
+                if (key.objectid < ref_path->owner_objectid ||
+                    (key.objectid == ref_path->owner_objectid &&
+                     key.type < BTRFS_EXTENT_DATA_KEY)) {
+                        key.objectid = ref_path->owner_objectid;
+                        key.type = BTRFS_EXTENT_DATA_KEY;
+                        key.offset = 0;
+                }
+        }
+        while (1) {
+                ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+                if (ret < 0)
+                        goto out;
+                leaf = path->nodes[0];
+                nritems = btrfs_header_nritems(leaf);
+next:
+                if (extent_locked && ret > 0) {
+                        /*
+                         * the file extent item was modified by someone
+                         * before the extent got locked.
+                         */
+                        unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
+                                      lock_end, GFP_NOFS);
+                        extent_locked = 0;
+                }
+                if (path->slots[0] >= nritems) {
+                        /* give up after running off the end of a leaf twice */
+                        if (++nr_scaned > 2)
+                                break;
+                        BUG_ON(extent_locked);
+                        ret = btrfs_next_leaf(root, path);
+                        if (ret < 0)
+                                goto out;
+                        if (ret > 0)
+                                break;
+                        leaf = path->nodes[0];
+                        nritems = btrfs_header_nritems(leaf);
+                }
+                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+                if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
+                        /* single owner: stop once past its possible items */
+                        if ((key.objectid > ref_path->owner_objectid) ||
+                            (key.objectid == ref_path->owner_objectid &&
+                             key.type > BTRFS_EXTENT_DATA_KEY) ||
+                            (key.offset >= first_pos + extent_key->offset))
+                                break;
+                }
+                if (inode && key.objectid != inode->i_ino) {
+                        /* moved on to another file: let go of the old inode */
+                        BUG_ON(extent_locked);
+                        btrfs_release_path(root, path);
+                        mutex_unlock(&inode->i_mutex);
+                        iput(inode);
+                        inode = NULL;
+                        continue;
+                }
+                if (key.type != BTRFS_EXTENT_DATA_KEY) {
+                        path->slots[0]++;
+                        ret = 1;
+                        goto next;
+                }
+                fi = btrfs_item_ptr(leaf, path->slots[0],
+                                    struct btrfs_file_extent_item);
+                extent_type = btrfs_file_extent_type(leaf, fi);
+                if ((extent_type != BTRFS_FILE_EXTENT_REG &&
+                     extent_type != BTRFS_FILE_EXTENT_PREALLOC) ||
+                    (btrfs_file_extent_disk_bytenr(leaf, fi) !=
+                     extent_key->objectid)) {
+                        path->slots[0]++;
+                        ret = 1;
+                        goto next;
+                }
+                num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
+                ext_offset = btrfs_file_extent_offset(leaf, fi);
+                if (first_pos > key.offset - ext_offset)
+                        first_pos = key.offset - ext_offset;
+                if (!extent_locked) {
+                        lock_start = key.offset;
+                        lock_end = lock_start + num_bytes - 1;
+                } else {
+                        /* the locked range no longer covers this item */
+                        if (lock_start > key.offset ||
+                            lock_end + 1 < key.offset + num_bytes) {
+                                unlock_extent(&BTRFS_I(inode)->io_tree,
+                                              lock_start, lock_end, GFP_NOFS);
+                                extent_locked = 0;
+                        }
+                }
+                if (!inode) {
+                        btrfs_release_path(root, path);
+                        inode = btrfs_iget_locked(root->fs_info->sb,
+                                                  key.objectid, root);
+                        /* the lookup may fail and hand back NULL */
+                        if (inode && (inode->i_state & I_NEW)) {
+                                BTRFS_I(inode)->root = root;
+                                BTRFS_I(inode)->location.objectid =
+                                        key.objectid;
+                                BTRFS_I(inode)->location.type =
+                                        BTRFS_INODE_ITEM_KEY;
+                                BTRFS_I(inode)->location.offset = 0;
+                                btrfs_read_locked_inode(inode);
+                                unlock_new_inode(inode);
+                        }
+                        /*
+                         * some code call btrfs_commit_transaction while
+                         * holding the i_mutex, so we can't use mutex_lock
+                         * here.
+                         */
+                        if (!inode || is_bad_inode(inode) ||
+                            !mutex_trylock(&inode->i_mutex)) {
+                                /* skip this file; iput() accepts NULL */
+                                iput(inode);
+                                inode = NULL;
+                                key.offset = (u64)-1;
+                                goto skip;
+                        }
+                }
+                if (!extent_locked) {
+                        struct btrfs_ordered_extent *ordered;
+                        btrfs_release_path(root, path);
+                        lock_extent(&BTRFS_I(inode)->io_tree, lock_start,
+                                    lock_end, GFP_NOFS);
+                        ordered = btrfs_lookup_first_ordered_extent(inode,
+                                                                    lock_end);
+                        if (ordered &&
+                            ordered->file_offset <= lock_end &&
+                            ordered->file_offset + ordered->len > lock_start) {
+                                /* overlapping ordered extent: skip this item */
+                                unlock_extent(&BTRFS_I(inode)->io_tree,
+                                              lock_start, lock_end, GFP_NOFS);
+                                btrfs_start_ordered_extent(inode, ordered, 1);
+                                btrfs_put_ordered_extent(ordered);
+                                key.offset += num_bytes;
+                                goto skip;
+                        }
+                        if (ordered)
+                                btrfs_put_ordered_extent(ordered);
+                        extent_locked = 1;
+                        /* search again now that the range is locked */
+                        continue;
+                }
+                if (nr_extents == 1) {
+                        /* update extent pointer in place */
+                        btrfs_set_file_extent_disk_bytenr(leaf, fi,
+                                                new_extents[0].disk_bytenr);
+                        btrfs_set_file_extent_disk_num_bytes(leaf, fi,
+                                                new_extents[0].disk_num_bytes);
+                        btrfs_mark_buffer_dirty(leaf);
+                        btrfs_drop_extent_cache(inode, key.offset,
+                                                key.offset + num_bytes - 1, 0);
+                        ret = btrfs_inc_extent_ref(trans, root,
+                                                new_extents[0].disk_bytenr,
+                                                new_extents[0].disk_num_bytes,
+                                                leaf->start,
+                                                root->root_key.objectid,
+                                                trans->transid,
+                                                key.objectid);
+                        BUG_ON(ret);
+                        ret = btrfs_free_extent(trans, root,
+                                                extent_key->objectid,
+                                                extent_key->offset,
+                                                leaf->start,
+                                                btrfs_header_owner(leaf),
+                                                btrfs_header_generation(leaf),
+                                                key.objectid, 0);
+                        BUG_ON(ret);
+                        btrfs_release_path(root, path);
+                        key.offset += num_bytes;
+                } else {
+                        BUG_ON(1);
+#if 0
+                        u64 alloc_hint;
+                        u64 extent_len;
+                        int i;
+                        /*
+                         * drop old extent pointer at first, then insert the
+                         * new pointers one by one
+                         */
+                        btrfs_release_path(root, path);
+                        ret = btrfs_drop_extents(trans, root, inode, key.offset,
+                                                 key.offset + num_bytes,
+                                                 key.offset, &alloc_hint);
+                        BUG_ON(ret);
+                        for (i = 0; i < nr_extents; i++) {
+                                if (ext_offset >= new_extents[i].num_bytes) {
+                                        ext_offset -= new_extents[i].num_bytes;
+                                        continue;
+                                }
+                                extent_len = min(new_extents[i].num_bytes -
+                                                 ext_offset, num_bytes);
+                                ret = btrfs_insert_empty_item(trans, root,
+                                                              path, &key,
+                                                              sizeof(*fi));
+                                BUG_ON(ret);
+                                leaf = path->nodes[0];
+                                fi = btrfs_item_ptr(leaf, path->slots[0],
+                                                struct btrfs_file_extent_item);
+                                btrfs_set_file_extent_generation(leaf, fi,
+                                                        trans->transid);
+                                btrfs_set_file_extent_type(leaf, fi,
+                                                        BTRFS_FILE_EXTENT_REG);
+                                btrfs_set_file_extent_disk_bytenr(leaf, fi,
+                                                new_extents[i].disk_bytenr);
+                                btrfs_set_file_extent_disk_num_bytes(leaf, fi,
+                                                new_extents[i].disk_num_bytes);
+                                btrfs_set_file_extent_ram_bytes(leaf, fi,
+                                                new_extents[i].ram_bytes);
+                                btrfs_set_file_extent_compression(leaf, fi,
+                                                new_extents[i].compression);
+                                btrfs_set_file_extent_encryption(leaf, fi,
+                                                new_extents[i].encryption);
+                                btrfs_set_file_extent_other_encoding(leaf, fi,
+                                                new_extents[i].other_encoding);
+                                btrfs_set_file_extent_num_bytes(leaf, fi,
+                                                        extent_len);
+                                ext_offset += new_extents[i].offset;
+                                btrfs_set_file_extent_offset(leaf, fi,
+                                                        ext_offset);
+                                btrfs_mark_buffer_dirty(leaf);
+                                btrfs_drop_extent_cache(inode, key.offset,
+                                                key.offset + extent_len - 1, 0);
+                                ret = btrfs_inc_extent_ref(trans, root,
+                                                new_extents[i].disk_bytenr,
+                                                new_extents[i].disk_num_bytes,
+                                                leaf->start,
+                                                root->root_key.objectid,
+                                                trans->transid, key.objectid);
+                                BUG_ON(ret);
+                                btrfs_release_path(root, path);
+                                inode_add_bytes(inode, extent_len);
+                                ext_offset = 0;
+                                num_bytes -= extent_len;
+                                key.offset += extent_len;
+                                if (num_bytes == 0)
+                                        break;
+                        }
+                        BUG_ON(i >= nr_extents);
+#endif
+                }
+                if (extent_locked) {
+                        unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
+                                      lock_end, GFP_NOFS);
+                        extent_locked = 0;
+                }
+skip:
+                /* single owner: done once the whole extent range was passed */
+                if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS &&
+                    key.offset >= first_pos + extent_key->offset)
+                        break;
+                cond_resched();
+        }
+        ret = 0;
+out:
+        btrfs_release_path(root, path);
+        if (inode) {
+                mutex_unlock(&inode->i_mutex);
+                if (extent_locked) {
+                        unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
+                                      lock_end, GFP_NOFS);
+                }
+                iput(inode);
+        }
+        return ret;
+}
+/*
+ * Give a leaf of the relocation tree its own cached leaf ref.
+ *
+ * When buf is a leaf (level 0), the leaf ref stored under orig_start is
+ * looked up and its extent list is copied into a new leaf ref that is
+ * keyed by buf and the running transaction, and that ref is added to
+ * root.  Buffers at other levels are left alone.
+ *
+ * buf must belong to the running transaction and root must be the
+ * relocation tree; both are enforced with BUG_ON().
+ *
+ * Returns 0 on success or for non-leaf buffers, -ENOENT when no leaf ref
+ * exists for orig_start and -ENOMEM when the new one cannot be allocated.
+ * A failure to add the new ref only triggers a warning.
+ */
+int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *root,
+                               struct extent_buffer *buf, u64 orig_start)
+{
+        struct btrfs_leaf_ref *orig_ref;
+        struct btrfs_leaf_ref *ref;
+        int ret;
+        BUG_ON(btrfs_header_generation(buf) != trans->transid);
+        BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
+        /* nothing to do for buffers above the leaf level */
+        if (btrfs_header_level(buf) != 0)
+                return 0;
+        orig_ref = btrfs_lookup_leaf_ref(root, orig_start);
+        if (!orig_ref)
+                return -ENOENT;
+        ref = btrfs_alloc_leaf_ref(root, orig_ref->nritems);
+        if (!ref) {
+                btrfs_free_leaf_ref(root, orig_ref);
+                return -ENOMEM;
+        }
+        /* clone the extent list, then drop our hold on the original */
+        ref->nritems = orig_ref->nritems;
+        memcpy(ref->extents, orig_ref->extents,
+               sizeof(ref->extents[0]) * ref->nritems);
+        btrfs_free_leaf_ref(root, orig_ref);
+        ref->bytenr = buf->start;
+        ref->owner = btrfs_header_owner(buf);
+        ref->generation = btrfs_header_generation(buf);
+        ref->root_gen = trans->transid;
+        ret = btrfs_add_leaf_ref(root, ref, 0);
+        WARN_ON(ret);
+        btrfs_free_leaf_ref(root, ref);
+        return 0;
+}
+/*
+ * Drop the cached extent mappings for every file extent found in leaf.
+ *
+ * Each extent data item in leaf that is neither inline nor has a zero
+ * disk bytenr is matched to its inode in target_root through
+ * btrfs_ilookup().  The item's file range is then locked in the inode's
+ * io_tree while the extent cache over that range is dropped.  Objectids
+ * for which the lookup returns nothing are skipped.
+ *
+ * root and group are not used.  Always returns 0.
+ */
+static noinline int invalidate_extent_cache(struct btrfs_root *root,
+                                        struct extent_buffer *leaf,
+                                        struct btrfs_block_group_cache *group,
+                                        struct btrfs_root *target_root)
+{
+        struct btrfs_file_extent_item *fi;
+        struct inode *inode = NULL;
+        struct btrfs_key key;
+        u64 skip_objectid = 0;
+        u64 range_end;
+        u32 nritems = btrfs_header_nritems(leaf);
+        u32 slot;
+        for (slot = 0; slot < nritems; slot++) {
+                btrfs_item_key_to_cpu(leaf, &key, slot);
+                if (key.type != BTRFS_EXTENT_DATA_KEY ||
+                    key.objectid == skip_objectid)
+                        continue;
+                /* inline extents and extents without disk space: no work */
+                fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+                if (btrfs_file_extent_type(leaf, fi) ==
+                    BTRFS_FILE_EXTENT_INLINE ||
+                    btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
+                        continue;
+                if (!inode || inode->i_ino != key.objectid) {
+                        /* switch to the inode that owns this item */
+                        iput(inode);
+                        inode = btrfs_ilookup(target_root->fs_info->sb,
+                                              key.objectid, target_root, 1);
+                        if (!inode) {
+                                /* remember the miss for its other items */
+                                skip_objectid = key.objectid;
+                                continue;
+                        }
+                }
+                range_end = key.offset +
+                            btrfs_file_extent_num_bytes(leaf, fi) - 1;
+                lock_extent(&BTRFS_I(inode)->io_tree, key.offset,
+                            range_end, GFP_NOFS);
+                btrfs_drop_extent_cache(inode, key.offset, range_end, 1);
+                unlock_extent(&BTRFS_I(inode)->io_tree, key.offset,
+                              range_end, GFP_NOFS);
+                cond_resched();
+        }
+        iput(inode);
+        return 0;
+}
+/*
+ * Rewrite the file extent items in @leaf that point into @group so that
+ * they point at the new locations reported by get_new_locations() for
+ * @reloc_inode.
+ *
+ * For each replaced extent the cached leaf ref entry and the on-leaf
+ * file extent item are updated, a reference is added on the new extent
+ * and the reference on the old extent is freed.
+ *
+ * Every failure is handled with BUG_ON(); the function always returns 0.
+ */
+static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans,
+                                        struct btrfs_root *root,
+                                        struct extent_buffer *leaf,
+                                        struct btrfs_block_group_cache *group,
+                                        struct inode *reloc_inode)
+{
+        struct btrfs_key key;
+        struct btrfs_key extent_key;
+        struct btrfs_file_extent_item *fi;
+        struct btrfs_leaf_ref *ref;
+        struct disk_extent *new_extent;
+        u64 bytenr;
+        u64 num_bytes;
+        u32 nritems;
+        u32 i;
+        int ext_index;
+        int nr_extent;
+        int ret;
+        new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS);
+        BUG_ON(!new_extent);
+        ref = btrfs_lookup_leaf_ref(root, leaf->start);
+        BUG_ON(!ref);
+        /*
+         * ext_index is the position in ref->extents[] of the current
+         * extent; it advances for every non-inline extent with a non-zero
+         * disk bytenr, whether or not the extent lies in @group.
+         */
+        ext_index = -1;
+        nritems = btrfs_header_nritems(leaf);
+        for (i = 0; i < nritems; i++) {
+                btrfs_item_key_to_cpu(leaf, &key, i);
+                if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
+                        continue;
+                fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
+                if (btrfs_file_extent_type(leaf, fi) ==
+                    BTRFS_FILE_EXTENT_INLINE)
+                        continue;
+                bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+                num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
+                if (bytenr == 0)
+                        continue;
+                ext_index++;
+                /* skip extents that do not overlap the block group range */
+                if (bytenr >= group->key.objectid + group->key.offset ||
+                    bytenr + num_bytes <= group->key.objectid)
+                        continue;
+                extent_key.objectid = bytenr;
+                extent_key.offset = num_bytes;
+                extent_key.type = BTRFS_EXTENT_ITEM_KEY;
+                nr_extent = 1;
+                ret = get_new_locations(reloc_inode, &extent_key,
+                                        group->key.objectid, 1,
+                                        &new_extent, &nr_extent);
+                /*
+                 * A positive return leaves this extent untouched.
+                 * NOTE(review): presumably it means no usable new location
+                 * was found - confirm against get_new_locations().
+                 */
+                if (ret > 0)
+                        continue;
+                BUG_ON(ret < 0);
+                /* the cached leaf ref must still describe the old extent */
+                BUG_ON(ref->extents[ext_index].bytenr != bytenr);
+                BUG_ON(ref->extents[ext_index].num_bytes != num_bytes);
+                ref->extents[ext_index].bytenr = new_extent->disk_bytenr;
+                ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes;
+                btrfs_set_file_extent_disk_bytenr(leaf, fi,
+                                                new_extent->disk_bytenr);
+                btrfs_set_file_extent_disk_num_bytes(leaf, fi,
+                                                new_extent->disk_num_bytes);
+                btrfs_mark_buffer_dirty(leaf);
+                /* take a reference on the new extent before dropping the old */
+                ret = btrfs_inc_extent_ref(trans, root,
+                                        new_extent->disk_bytenr,
+                                        new_extent->disk_num_bytes,
+                                        leaf->start,
+                                        root->root_key.objectid,
+                                        trans->transid, key.objectid);
+                BUG_ON(ret);
+                ret = btrfs_free_extent(trans, root,
+                                        bytenr, num_bytes, leaf->start,
+                                        btrfs_header_owner(leaf),
+                                        btrfs_header_generation(leaf),
+                                        key.objectid, 0);
+                BUG_ON(ret);
+                cond_resched();
+        }
+        kfree(new_extent);
+        /* the number of extents walked must match the cached leaf ref */
+        BUG_ON(ext_index + 1 != ref->nritems);
+        btrfs_free_leaf_ref(root, ref);
+        return 0;
+}
+/*
+ * Detach the reloc tree from @root, if it has one, and queue it on the
+ * filesystem's dead_reloc_roots list.
+ *
+ * The reloc root's root item is refreshed with the current node start and
+ * level, its drop progress is reset, and the item is written back to the
+ * tree root. A failure of btrfs_update_root() triggers BUG_ON().
+ *
+ * Always returns 0.
+ */
+int btrfs_free_reloc_root(struct btrfs_trans_handle *trans,
+                          struct btrfs_root *root)
+{
+        struct btrfs_root *reloc_root;
+        int ret;
+        if (root->reloc_root) {
+                reloc_root = root->reloc_root;
+                root->reloc_root = NULL;
+                list_add(&reloc_root->dead_list,
+                         &root->fs_info->dead_reloc_roots);
+                btrfs_set_root_bytenr(&reloc_root->root_item,
+                                      reloc_root->node->start);
+                /*
+                 * The level belongs to the reloc root's own item, which is
+                 * the one written by btrfs_update_root() below; it must not
+                 * be stored into the subvolume's root item.
+                 */
+                btrfs_set_root_level(&reloc_root->root_item,
+                                     btrfs_header_level(reloc_root->node));
+                memset(&reloc_root->root_item.drop_progress, 0,
+                        sizeof(struct btrfs_disk_key));
+                reloc_root->root_item.drop_level = 0;
+                ret = btrfs_update_root(trans, root->fs_info->tree_root,
+                                        &reloc_root->root_key,
+                                        &reloc_root->root_item);
+                BUG_ON(ret);
+        }
+        return 0;
+}
+/*
+ * Drop every reloc root queued on fs_info->dead_reloc_roots.
+ *
+ * The queue is spliced onto a private list first. Each root is then
+ * dropped with btrfs_drop_snapshot(), its node is released and its root
+ * item is deleted from the tree root.
+ *
+ * Always returns 0; failures are handled with BUG_ON().
+ */
+int btrfs_drop_dead_reloc_roots(struct btrfs_root *root)
+{
+        struct btrfs_trans_handle *trans;
+        struct btrfs_root *reloc_root;
+        struct btrfs_root *prev_root = NULL;
+        struct list_head dead_roots;
+        int ret;
+        unsigned long nr;
+        INIT_LIST_HEAD(&dead_roots);
+        list_splice_init(&root->fs_info->dead_reloc_roots, &dead_roots);
+        while (!list_empty(&dead_roots)) {
+                /* process the list from its tail */
+                reloc_root = list_entry(dead_roots.prev,
+                                        struct btrfs_root, dead_list);
+                list_del_init(&reloc_root->dead_list);
+                BUG_ON(reloc_root->commit_root != NULL);
+                /*
+                 * Retry the drop in a fresh transaction for as long as
+                 * btrfs_drop_snapshot() returns -EAGAIN. The loop is left
+                 * via break with drop_mutex still held and the transaction
+                 * still open; both are released after the root item has
+                 * been deleted below.
+                 * NOTE(review): a non-EAGAIN error from btrfs_drop_snapshot()
+                 * is overwritten by btrfs_del_root() without being checked.
+                 */
+                while (1) {
+                        trans = btrfs_join_transaction(root, 1);
+                        BUG_ON(!trans);
+                        mutex_lock(&root->fs_info->drop_mutex);
+                        ret = btrfs_drop_snapshot(trans, reloc_root);
+                        if (ret != -EAGAIN)
+                                break;
+                        mutex_unlock(&root->fs_info->drop_mutex);
+                        nr = trans->blocks_used;
+                        ret = btrfs_end_transaction(trans, root);
+                        BUG_ON(ret);
+                        btrfs_btree_balance_dirty(root, nr);
+                }
+                free_extent_buffer(reloc_root->node);
+                ret = btrfs_del_root(trans, root->fs_info->tree_root,
+                                     &reloc_root->root_key);
+                BUG_ON(ret);
+                mutex_unlock(&root->fs_info->drop_mutex);
+                nr = trans->blocks_used;
+                ret = btrfs_end_transaction(trans, root);
+                BUG_ON(ret);
+                btrfs_btree_balance_dirty(root, nr);
+                /*
+                 * Freeing is deferred by one iteration: the previous root is
+                 * freed here and the current one is kept for later.
+                 */
+                kfree(prev_root);
+                prev_root = reloc_root;
+        }
+        /* the last root is used to remove leaf refs before it is freed */
+        if (prev_root) {
+                btrfs_remove_leaf_refs(prev_root, (u64)-1, 0);
+                kfree(prev_root);
+        }
+        return 0;
+}
+/* Queue @root on its filesystem's dead_reloc_roots list. Always returns 0. */
+int btrfs_add_dead_reloc_root(struct btrfs_root *root)
+{
+        struct list_head *dead_list_head = &root->fs_info->dead_reloc_roots;
+        list_add(&root->dead_list, dead_list_head);
+        return 0;
+}
+/*
+ * Clean up relocation state that may be left on the filesystem.
+ *
+ * Dead reloc roots are collected with btrfs_find_dead_roots() under
+ * tree_reloc_mutex. If any were found, a transaction is started and
+ * committed. Finally orphan cleanup is run on the data relocation tree.
+ *
+ * Always returns 0; failures are handled with BUG_ON().
+ */
+int btrfs_cleanup_reloc_trees(struct btrfs_root *root)
+{
+        struct btrfs_root *reloc_root;
+        struct btrfs_trans_handle *trans;
+        struct btrfs_key location;
+        int found;
+        int ret;
+        mutex_lock(&root->fs_info->tree_reloc_mutex);
+        ret = btrfs_find_dead_roots(root, BTRFS_TREE_RELOC_OBJECTID, NULL);
+        BUG_ON(ret);
+        found = !list_empty(&root->fs_info->dead_reloc_roots);
+        mutex_unlock(&root->fs_info->tree_reloc_mutex);
+        if (found) {
+                trans = btrfs_start_transaction(root, 1);
+                BUG_ON(!trans);
+                ret = btrfs_commit_transaction(trans, root);
+                BUG_ON(ret);
+        }
+        location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
+        location.offset = (u64)-1;
+        location.type = BTRFS_ROOT_ITEM_KEY;
+        reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
+        /*
+         * create_reloc_inode() checks this helper's result with IS_ERR(),
+         * so catch error pointers here as well instead of handing one to
+         * btrfs_orphan_cleanup().
+         */
+        BUG_ON(!reloc_root || IS_ERR(reloc_root));
+        btrfs_orphan_cleanup(reloc_root);
+        return 0;
+}
+/*
+ * Create the reloc tree for @root if it does not have one yet.
+ *
+ * The reloc tree starts as a copy of root->commit_root made with
+ * btrfs_copy_root(). Its root item is a copy of @root's root item with
+ * refs set to 0 and bytenr, level and generation updated for the copy;
+ * it is inserted into the tree root under the key
+ * (BTRFS_TREE_RELOC_OBJECTID, BTRFS_ROOT_ITEM_KEY, root objectid).
+ *
+ * @root must be reference counted (ref_cows). Always returns 0; failures
+ * are handled with BUG_ON().
+ */
+static noinline int init_reloc_tree(struct btrfs_trans_handle *trans,
+                                    struct btrfs_root *root)
+{
+        struct btrfs_root *reloc_root;
+        struct extent_buffer *eb;
+        struct btrfs_root_item *root_item;
+        struct btrfs_key root_key;
+        int ret;
+        BUG_ON(!root->ref_cows);
+        if (root->reloc_root)
+                return 0;
+        root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
+        BUG_ON(!root_item);
+        ret = btrfs_copy_root(trans, root, root->commit_root,
+                              &eb, BTRFS_TREE_RELOC_OBJECTID);
+        BUG_ON(ret);
+        root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
+        root_key.offset = root->root_key.objectid;
+        root_key.type = BTRFS_ROOT_ITEM_KEY;
+        /*
+         * Copy the whole root item. sizeof(root_item) would only be the
+         * size of the pointer and leave most of the kmalloc'ed item
+         * uninitialized before it is inserted into the tree root.
+         */
+        memcpy(root_item, &root->root_item, sizeof(*root_item));
+        btrfs_set_root_refs(root_item, 0);
+        btrfs_set_root_bytenr(root_item, eb->start);
+        btrfs_set_root_level(root_item, btrfs_header_level(eb));
+        btrfs_set_root_generation(root_item, trans->transid);
+        btrfs_tree_unlock(eb);
+        free_extent_buffer(eb);
+        ret = btrfs_insert_root(trans, root->fs_info->tree_root,
+                                &root_key, root_item);
+        BUG_ON(ret);
+        kfree(root_item);
+        reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
+                                                 &root_key);
+        BUG_ON(!reloc_root);
+        reloc_root->last_trans = trans->transid;
+        reloc_root->commit_root = NULL;
+        reloc_root->ref_tree = &root->fs_info->reloc_ref_tree;
+        root->reloc_root = reloc_root;
+        return 0;
+}
+/*
+ * Core function of space balance.
+ *
+ * The idea is using reloc trees to relocate tree blocks in reference
+ * counted roots. There is one reloc tree for each subvol, and all
+ * reloc trees share same root key objectid. Reloc trees are snapshots
+ * of the latest committed roots of subvols (root->commit_root).
+ *
+ * To relocate a tree block referenced by a subvol, there are two steps.
+ * COW the block through subvol's reloc tree, then update block pointer
+ * in the subvol to point to the new block. Since all reloc trees share
+ * same root key objectid, doing special handling for tree blocks owned
+ * by them is easy. Once a tree block has been COWed in one reloc tree,
+ * we can use the resulting new block directly when the same block is
+ * required to COW again through other reloc trees. By this way, relocated
+ * tree blocks are shared between reloc trees, so they are also shared
+ * between subvols.
+ */
+static noinline int relocate_one_path(struct btrfs_trans_handle *trans,
+                                      struct btrfs_root *root,
+                                      struct btrfs_path *path,
+                                      struct btrfs_key *first_key,
+                                      struct btrfs_ref_path *ref_path,
+                                      struct btrfs_block_group_cache *group,
+                                      struct inode *reloc_inode)
+{
+        struct btrfs_root *reloc_root;
+        struct extent_buffer *eb = NULL;
+        struct btrfs_key *keys;
+        u64 *nodes;
+        int level;
+        int shared_level;
+        int lowest_level = 0;
+        int ret;
+        /*
+         * An owner objectid below BTRFS_FIRST_FREE_OBJECTID is used as the
+         * tree level to stop at; otherwise the search goes down to level 0.
+         */
+        if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
+                lowest_level = ref_path->owner_objectid;
+        /*
+         * Roots that are not reference counted have no reloc tree: a
+         * COWing search down to lowest_level is all that is done for them.
+         */
+        if (!root->ref_cows) {
+                path->lowest_level = lowest_level;
+                ret = btrfs_search_slot(trans, root, first_key, path, 0, 1);
+                BUG_ON(ret < 0);
+                path->lowest_level = 0;
+                btrfs_release_path(root, path);
+                return 0;
+        }
+        /* everything below runs with tree_reloc_mutex held */
+        mutex_lock(&root->fs_info->tree_reloc_mutex);
+        ret = init_reloc_tree(trans, root);
+        BUG_ON(ret);
+        reloc_root = root->reloc_root;
+        /*
+         * Remember the shared level found by the ref path walk, then clear
+         * the recorded keys and new block numbers above that level.
+         */
+        shared_level = ref_path->shared_level;
+        ref_path->shared_level = BTRFS_MAX_LEVEL - 1;
+        keys = ref_path->node_keys;
+        nodes = ref_path->new_nodes;
+        memset(&keys[shared_level + 1], 0,
+               sizeof(*keys) * (BTRFS_MAX_LEVEL - shared_level - 1));
+        memset(&nodes[shared_level + 1], 0,
+               sizeof(*nodes) * (BTRFS_MAX_LEVEL - shared_level - 1));
+        if (nodes[lowest_level] == 0) {
+                /*
+                 * No new block recorded for the lowest level yet: COW the
+                 * path in the reloc tree and record the start and first key
+                 * of every block on it, stopping at the reloc tree root.
+                 */
+                path->lowest_level = lowest_level;
+                ret = btrfs_search_slot(trans, reloc_root, first_key, path,
+                                        0, 1);
+                BUG_ON(ret);
+                for (level = lowest_level; level < BTRFS_MAX_LEVEL; level++) {
+                        eb = path->nodes[level];
+                        if (!eb || eb == reloc_root->node)
+                                break;
+                        nodes[level] = eb->start;
+                        if (level == 0)
+                                btrfs_item_key_to_cpu(eb, &keys[level], 0);
+                        else
+                                btrfs_node_key_to_cpu(eb, &keys[level], 0);
+                }
+                /* for data owners, repoint the leaf's file extents as well */
+                if (nodes[0] &&
+                    ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
+                        eb = path->nodes[0];
+                        ret = replace_extents_in_leaf(trans, reloc_root, eb,
+                                                      group, reloc_inode);
+                        BUG_ON(ret);
+                }
+                btrfs_release_path(reloc_root, path);
+        } else {
+                /* blocks were already recorded: merge them into this reloc tree */
+                ret = btrfs_merge_path(trans, reloc_root, keys, nodes,
+                                       lowest_level);
+                BUG_ON(ret);
+        }
+        /*
+         * replace tree blocks in the fs tree with tree blocks in
+         * the reloc tree.
+         */
+        ret = btrfs_merge_path(trans, root, keys, nodes, lowest_level);
+        BUG_ON(ret < 0);
+        if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
+                /*
+                 * Look the leaf up again in the reloc tree without COW and
+                 * hold an extra reference on it, so it stays usable after
+                 * the path is released while the extent cache of the
+                 * inodes in @root is invalidated.
+                 */
+                ret = btrfs_search_slot(trans, reloc_root, first_key, path,
+                                        0, 0);
+                BUG_ON(ret);
+                extent_buffer_get(path->nodes[0]);
+                eb = path->nodes[0];
+                btrfs_release_path(reloc_root, path);
+                ret = invalidate_extent_cache(reloc_root, eb, group, root);
+                BUG_ON(ret);
+                free_extent_buffer(eb);
+        }
+        mutex_unlock(&root->fs_info->tree_reloc_mutex);
+        path->lowest_level = 0;
+        return 0;
+}
+/*
+ * Relocate one tree block by relocating the path that leads to it. No
+ * block group or reloc inode is passed down. When @root is the extent
+ * root, btrfs_extent_post_op() is run afterwards. Always returns 0.
+ */
+static noinline int relocate_tree_block(struct btrfs_trans_handle *trans,
+                                        struct btrfs_root *root,
+                                        struct btrfs_path *path,
+                                        struct btrfs_key *first_key,
+                                        struct btrfs_ref_path *ref_path)
+{
+        int err = relocate_one_path(trans, root, path, first_key,
+                                    ref_path, NULL, NULL);
+        BUG_ON(err);
+        if (root->fs_info->extent_root == root)
+                btrfs_extent_post_op(trans, root);
+        return 0;
+}
+/*
+ * Search @extent_root for @extent_key and delete the item when the search
+ * returns 0. The path is released in every case. Returns the result of
+ * the search if it was non-zero, otherwise the result of the deletion.
+ */
+static noinline int del_extent_zero(struct btrfs_trans_handle *trans,
+                                    struct btrfs_root *extent_root,
+                                    struct btrfs_path *path,
+                                    struct btrfs_key *extent_key)
+{
+        int err = btrfs_search_slot(trans, extent_root, extent_key, path,
+                                    -1, 1);
+        if (err == 0)
+                err = btrfs_del_item(trans, extent_root, path);
+        btrfs_release_path(extent_root, path);
+        return err;
+}
+/*
+ * Read the root named by ref_path->root_objectid. The key offset is 0
+ * for COW-only roots and (u64)-1 for all other roots.
+ */
+static noinline struct btrfs_root *read_ref_root(struct btrfs_fs_info *fs_info,
+                                                struct btrfs_ref_path *ref_path)
+{
+        struct btrfs_key location;
+        location.objectid = ref_path->root_objectid;
+        location.type = BTRFS_ROOT_ITEM_KEY;
+        location.offset = is_cowonly_root(ref_path->root_objectid) ?
+                          0 : (u64)-1;
+        return btrfs_read_fs_root_no_name(fs_info, &location);
+}
+/*
+ * Relocate one extent of @group by walking every reference path that
+ * leads to it.
+ *
+ * @pass selects the strategy for data extents:
+ *   pass 0 - copy the data to its new location with
+ *            relocate_data_extent() and stop;
+ *   pass 1 - update references through relocate_one_path();
+ *   other  - fall back to replace_one_extent() using the locations
+ *            returned by get_new_locations().
+ * Tree blocks are always handled by relocate_tree_block().
+ *
+ * An extent key with objectid 0 is simply deleted from the extent tree.
+ *
+ * Returns 0 on success, a negative errno on failure, or the non-zero
+ * value passed through from del_extent_zero() or get_new_locations().
+ */
+static noinline int relocate_one_extent(struct btrfs_root *extent_root,
+                                        struct btrfs_path *path,
+                                        struct btrfs_key *extent_key,
+                                        struct btrfs_block_group_cache *group,
+                                        struct inode *reloc_inode, int pass)
+{
+        struct btrfs_trans_handle *trans;
+        struct btrfs_root *found_root;
+        struct btrfs_ref_path *ref_path = NULL;
+        struct disk_extent *new_extents = NULL;
+        int nr_extents = 0;
+        int loops;
+        int ret;
+        int level;
+        struct btrfs_key first_key;
+        u64 prev_block = 0;
+        trans = btrfs_start_transaction(extent_root, 1);
+        BUG_ON(!trans);
+        if (extent_key->objectid == 0) {
+                ret = del_extent_zero(trans, extent_root, path, extent_key);
+                goto out;
+        }
+        ref_path = kmalloc(sizeof(*ref_path), GFP_NOFS);
+        if (!ref_path) {
+                ret = -ENOMEM;
+                goto out;
+        }
+        for (loops = 0; ; loops++) {
+                if (loops == 0) {
+                        ret = btrfs_first_ref_path(trans, extent_root, ref_path,
+                                                   extent_key->objectid);
+                } else {
+                        ret = btrfs_next_ref_path(trans, extent_root, ref_path);
+                }
+                if (ret < 0)
+                        goto out;
+                /* a positive return ends the walk */
+                if (ret > 0)
+                        break;
+                /* references from log trees and reloc trees are not processed */
+                if (ref_path->root_objectid == BTRFS_TREE_LOG_OBJECTID ||
+                    ref_path->root_objectid == BTRFS_TREE_RELOC_OBJECTID)
+                        continue;
+                found_root = read_ref_root(extent_root->fs_info, ref_path);
+                BUG_ON(!found_root);
+                /*
+                 * for reference counted tree, only process reference paths
+                 * rooted at the latest committed root.
+                 */
+                if (found_root->ref_cows &&
+                    ref_path->root_generation != found_root->root_key.offset)
+                        continue;
+                if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
+                        if (pass == 0) {
+                                /*
+                                 * copy data extents to new locations
+                                 */
+                                u64 group_start = group->key.objectid;
+                                ret = relocate_data_extent(reloc_inode,
+                                                           extent_key,
+                                                           group_start);
+                                if (ret < 0)
+                                        goto out;
+                                break;
+                        }
+                        level = 0;
+                } else {
+                        level = ref_path->owner_objectid;
+                }
+                /*
+                 * Read the first key of the referencing block, unless it is
+                 * the same block as in the previous iteration.
+                 */
+                if (prev_block != ref_path->nodes[level]) {
+                        struct extent_buffer *eb;
+                        u64 block_start = ref_path->nodes[level];
+                        u64 block_size = btrfs_level_size(found_root, level);
+                        eb = read_tree_block(found_root, block_start,
+                                             block_size, 0);
+                        btrfs_tree_lock(eb);
+                        BUG_ON(level != btrfs_header_level(eb));
+                        if (level == 0)
+                                btrfs_item_key_to_cpu(eb, &first_key, 0);
+                        else
+                                btrfs_node_key_to_cpu(eb, &first_key, 0);
+                        btrfs_tree_unlock(eb);
+                        free_extent_buffer(eb);
+                        prev_block = block_start;
+                }
+                btrfs_record_root_in_trans(found_root);
+                if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
+                        /*
+                         * try to update data extent references while
+                         * keeping metadata shared between snapshots.
+                         */
+                        if (pass == 1) {
+                                ret = relocate_one_path(trans, found_root,
+                                                path, &first_key, ref_path,
+                                                group, reloc_inode);
+                                if (ret < 0)
+                                        goto out;
+                                continue;
+                        }
+                        /*
+                         * use fallback method to process the remaining
+                         * references.
+                         */
+                        if (!new_extents) {
+                                u64 group_start = group->key.objectid;
+                                new_extents = kmalloc(sizeof(*new_extents),
+                                                      GFP_NOFS);
+                                /*
+                                 * Fail like the ref_path allocation above
+                                 * instead of passing a NULL buffer on to
+                                 * get_new_locations().
+                                 */
+                                if (!new_extents) {
+                                        ret = -ENOMEM;
+                                        goto out;
+                                }
+                                nr_extents = 1;
+                                ret = get_new_locations(reloc_inode,
+                                                        extent_key,
+                                                        group_start, 1,
+                                                        &new_extents,
+                                                        &nr_extents);
+                                if (ret)
+                                        goto out;
+                        }
+                        ret = replace_one_extent(trans, found_root,
+                                                path, extent_key,
+                                                &first_key, ref_path,
+                                                new_extents, nr_extents);
+                } else {
+                        ret = relocate_tree_block(trans, found_root, path,
+                                                  &first_key, ref_path);
+                }
+                if (ret < 0)
+                        goto out;
+        }
+        ret = 0;
+out:
+        btrfs_end_transaction(trans, extent_root);
+        kfree(new_extents);
+        kfree(ref_path);
+        return ret;
+}
+/*
+ * Compute the block group flags to use in place of @flags, based on the
+ * number of writable devices.
+ *
+ * One writable device: raid0 becomes single, raid1/raid10 become dup,
+ * anything else is returned unchanged.
+ * Otherwise: flags that already carry a raid bit are returned unchanged,
+ * dup becomes raid1 and single becomes raid0.
+ */
+static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
+{
+        const u64 raid_bits = BTRFS_BLOCK_GROUP_RAID0 |
+                BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
+        const u64 profile_bits = raid_bits | BTRFS_BLOCK_GROUP_DUP;
+        /* @flags with every raid/dup bit cleared */
+        const u64 plain = flags & ~profile_bits;
+        u64 rw_devices = root->fs_info->fs_devices->rw_devices;
+        if (rw_devices == 1) {
+                /* turn raid0 into single device chunks */
+                if (flags & BTRFS_BLOCK_GROUP_RAID0)
+                        return plain;
+                /* turn mirroring into duplication */
+                if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
+                             BTRFS_BLOCK_GROUP_RAID10))
+                        return plain | BTRFS_BLOCK_GROUP_DUP;
+                return flags;
+        }
+        /* they already had raid on here, just return */
+        if (flags & raid_bits)
+                return flags;
+        /* switch duplicated blocks with raid1 */
+        if (flags & BTRFS_BLOCK_GROUP_DUP)
+                return plain | BTRFS_BLOCK_GROUP_RAID1;
+        /* turn single device chunks into raid0 */
+        return plain | BTRFS_BLOCK_GROUP_RAID0;
+}
+/*
+ * Allocate a chunk before @shrink_block_group is emptied.
+ *
+ * Nothing is done when the block group has no used bytes. Otherwise the
+ * requested size is the used byte count when the flags computed by
+ * update_block_group_flags() differ from the group's current flags, or
+ * the full group size when they are equal, plus 2MB in both cases.
+ * The group's spinlock is dropped around btrfs_start_transaction().
+ *
+ * Always returns 0.
+ */
+static int __alloc_chunk_for_shrink(struct btrfs_root *root,
+                     struct btrfs_block_group_cache *shrink_block_group,
+                     int force)
+{
+        struct btrfs_trans_handle *trans;
+        u64 alloc_flags;
+        u64 bytes_wanted;
+        spin_lock(&shrink_block_group->lock);
+        if (!(btrfs_block_group_used(&shrink_block_group->item) > 0)) {
+                spin_unlock(&shrink_block_group->lock);
+                return 0;
+        }
+        spin_unlock(&shrink_block_group->lock);
+        trans = btrfs_start_transaction(root, 1);
+        spin_lock(&shrink_block_group->lock);
+        alloc_flags = update_block_group_flags(root,
+                                               shrink_block_group->flags);
+        if (alloc_flags == shrink_block_group->flags)
+                bytes_wanted = shrink_block_group->key.offset;
+        else
+                bytes_wanted =
+                        btrfs_block_group_used(&shrink_block_group->item);
+        spin_unlock(&shrink_block_group->lock);
+        do_chunk_alloc(trans, root->fs_info->extent_root,
+                       bytes_wanted + 2 * 1024 * 1024, alloc_flags, force);
+        btrfs_end_transaction(trans, root);
+        return 0;
+}
+/*
+ * Insert an inode item for @objectid into @root. The item is zeroed and
+ * then given generation 1, size @size, mode S_IFREG | 0600 and the
+ * BTRFS_INODE_NOCOMPRESS flag.
+ *
+ * Returns 0 on success, -ENOMEM if no path could be allocated, or the
+ * error from btrfs_insert_empty_inode().
+ */
+static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
+                                 struct btrfs_root *root,
+                                 u64 objectid, u64 size)
+{
+        struct btrfs_inode_item *inode_item;
+        struct extent_buffer *eb;
+        struct btrfs_path *path;
+        int err;
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+        err = btrfs_insert_empty_inode(trans, root, path, objectid);
+        if (!err) {
+                eb = path->nodes[0];
+                inode_item = btrfs_item_ptr(eb, path->slots[0],
+                                            struct btrfs_inode_item);
+                memset_extent_buffer(eb, 0, (unsigned long)inode_item,
+                                     sizeof(*inode_item));
+                btrfs_set_inode_generation(eb, inode_item, 1);
+                btrfs_set_inode_size(eb, inode_item, size);
+                btrfs_set_inode_mode(eb, inode_item, S_IFREG | 0600);
+                btrfs_set_inode_flags(eb, inode_item, BTRFS_INODE_NOCOMPRESS);
+                btrfs_mark_buffer_dirty(eb);
+                btrfs_release_path(root, path);
+        }
+        btrfs_free_path(path);
+        return err;
+}
+/*
+ * Create the inode used to relocate the data of @group.
+ *
+ * The inode lives in the data relocation tree, is as large as the block
+ * group (group->key.offset) and gets a single file extent item covering
+ * that size. It is added to the orphan list before it is returned.
+ *
+ * Returns the inode, or an ERR_PTR() on failure.
+ */
+static noinline struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
+                                        struct btrfs_block_group_cache *group)
+{
+        struct inode *inode = NULL;
+        struct btrfs_trans_handle *trans;
+        struct btrfs_root *root;
+        struct btrfs_key root_key;
+        u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
+        int err = 0;
+        root_key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
+        root_key.type = BTRFS_ROOT_ITEM_KEY;
+        root_key.offset = (u64)-1;
+        root = btrfs_read_fs_root_no_name(fs_info, &root_key);
+        if (IS_ERR(root))
+                return ERR_CAST(root);
+        trans = btrfs_start_transaction(root, 1);
+        BUG_ON(!trans);
+        err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
+        if (err)
+                goto out;
+        err = __insert_orphan_inode(trans, root, objectid, group->key.offset);
+        BUG_ON(err);
+        err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
+                                       group->key.offset, 0, group->key.offset,
+                                       0, 0, 0);
+        BUG_ON(err);
+        /*
+         * NOTE(review): the result of btrfs_iget_locked() is dereferenced
+         * without a NULL check - confirm it cannot fail here.
+         */
+        inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
+        if (inode->i_state & I_NEW) {
+                BTRFS_I(inode)->root = root;
+                BTRFS_I(inode)->location.objectid = objectid;
+                BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
+                BTRFS_I(inode)->location.offset = 0;
+                btrfs_read_locked_inode(inode);
+                unlock_new_inode(inode);
+                BUG_ON(is_bad_inode(inode));
+        } else {
+                /* the inode was just created, so it must be new */
+                BUG_ON(1);
+        }
+        /*
+         * index_cnt holds the start of the block group;
+         * btrfs_reloc_clone_csums() adds it to a file position to get the
+         * disk bytenr for its checksum lookup.
+         */
+        BTRFS_I(inode)->index_cnt = group->key.objectid;
+        err = btrfs_orphan_add(trans, inode);
+out:
+        btrfs_end_transaction(trans, root);
+        if (err) {
+                if (inode)
+                        iput(inode);
+                inode = ERR_PTR(err);
+        }
+        return inode;
+}
+/*
+ * Attach checksums to the ordered extent of @inode that starts at
+ * @file_pos and is @len bytes long.
+ *
+ * The checksums are looked up in the csum root for the disk range that
+ * starts at file_pos + index_cnt, rebased onto ordered->start and added
+ * to the ordered extent.
+ *
+ * Always returns 0.
+ */
+int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
+{
+        struct btrfs_ordered_sum *sums;
+        struct btrfs_sector_sum *sector_sum;
+        struct btrfs_ordered_extent *ordered;
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        struct list_head list;
+        size_t offset;
+        int ret;
+        u64 disk_bytenr;
+        INIT_LIST_HEAD(&list);
+        /*
+         * NOTE(review): ordered is dereferenced without a NULL check -
+         * confirm the caller guarantees an ordered extent at file_pos.
+         */
+        ordered = btrfs_lookup_ordered_extent(inode, file_pos);
+        BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
+        disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
+        /* NOTE(review): ret is assigned here but never checked or returned */
+        ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
+                                       disk_bytenr + len - 1, &list);
+        while (!list_empty(&list)) {
+                sums = list_entry(list.next, struct btrfs_ordered_sum, list);
+                list_del_init(&sums->list);
+                sector_sum = sums->sums;
+                sums->bytenr = ordered->start;
+                offset = 0;
+                /* shift every per-sector bytenr from the old range to the new */
+                while (offset < sums->len) {
+                        sector_sum->bytenr += ordered->start - disk_bytenr;
+                        sector_sum++;
+                        offset += root->sectorsize;
+                }
+                btrfs_add_ordered_sum(inode, ordered, sums);
+        }
+        btrfs_put_ordered_extent(ordered);
+        return 0;
+}
+/*
+ * Relocate the extents that live in the block group starting at
+ * 'group_start'.
+ *
+ * The group is marked read-only, then the extent tree is scanned in repeated
+ * passes for BTRFS_EXTENT_ITEM_KEY items inside the group's byte range; each
+ * one found is handed to relocate_one_extent().  Passes repeat until one
+ * finds no extents at all.
+ *
+ * Returns 0 on success, or the negative value returned by
+ * btrfs_search_slot()/btrfs_next_leaf() when the scan fails.
+ *
+ * NOTE(review): the 'goto out' error exits jump past iput(reloc_inode) and
+ * put_block_group(), which only run on the success path -- confirm whether
+ * those references should also be dropped on error.
+ */
+int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start)
+{
+        struct btrfs_trans_handle *trans;
+        struct btrfs_path *path;
+        struct btrfs_fs_info *info = root->fs_info;
+        struct extent_buffer *leaf;
+        struct inode *reloc_inode;
+        struct btrfs_block_group_cache *block_group;
+        struct btrfs_key key;
+        u64 skipped;
+        u64 cur_byte;
+        u64 total_found;
+        u32 nritems;
+        int ret;
+        int progress;
+        int pass = 0;
+        /* all tree searches below run against the extent root */
+        root = root->fs_info->extent_root;
+        block_group = btrfs_lookup_block_group(info, group_start);
+        BUG_ON(!block_group);
+        printk(KERN_INFO "btrfs relocating block group %llu flags %llu\n",
+               (unsigned long long)block_group->key.objectid,
+               (unsigned long long)block_group->flags);
+        path = btrfs_alloc_path();
+        BUG_ON(!path);
+        reloc_inode = create_reloc_inode(info, block_group);
+        BUG_ON(IS_ERR(reloc_inode));
+        __alloc_chunk_for_shrink(root, block_group, 1);
+        set_block_group_readonly(block_group);
+        /* start delalloc writeback and wait for ordered extents to finish */
+        btrfs_start_delalloc_inodes(info->tree_root);
+        btrfs_wait_ordered_extents(info->tree_root, 0);
+again:
+        /* per-pass state: restart the scan at the beginning of the group */
+        skipped = 0;
+        total_found = 0;
+        progress = 0;
+        key.objectid = block_group->key.objectid;
+        key.offset = 0;
+        key.type = 0;
+        cur_byte = key.objectid;
+        /*
+         * commit a transaction before each pass
+         * NOTE(review): the result of btrfs_start_transaction() is not checked
+         */
+        trans = btrfs_start_transaction(info->tree_root, 1);
+        btrfs_commit_transaction(trans, info->tree_root);
+        mutex_lock(&root->fs_info->cleaner_mutex);
+        btrfs_clean_old_snapshots(info->tree_root);
+        btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
+        mutex_unlock(&root->fs_info->cleaner_mutex);
+        while (1) {
+                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+                if (ret < 0)
+                        goto out;
+next:
+                leaf = path->nodes[0];
+                nritems = btrfs_header_nritems(leaf);
+                if (path->slots[0] >= nritems) {
+                        ret = btrfs_next_leaf(root, path);
+                        if (ret < 0)
+                                goto out;
+                        /* ret == 1: no further leaf, this pass is done */
+                        if (ret == 1) {
+                                ret = 0;
+                                break;
+                        }
+                        leaf = path->nodes[0];
+                        nritems = btrfs_header_nritems(leaf);
+                }
+                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+                /* stop once the key is past the end of this block group */
+                if (key.objectid >= block_group->key.objectid +
+                    block_group->key.offset)
+                        break;
+                /*
+                 * give up the CPU if asked to, but only after at least one
+                 * item was examined since the last reschedule; the path is
+                 * dropped first and the search redone from 'key'
+                 */
+                if (progress && need_resched()) {
+                        btrfs_release_path(root, path);
+                        cond_resched();
+                        progress = 0;
+                        continue;
+                }
+                progress = 1;
+                /* skip non-extent items and extents ending at or before cur_byte */
+                if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY ||
+                    key.objectid + key.offset <= cur_byte) {
+                        path->slots[0]++;
+                        goto next;
+                }
+                total_found++;
+                cur_byte = key.objectid + key.offset;
+                btrfs_release_path(root, path);
+                __alloc_chunk_for_shrink(root, block_group, 0);
+                ret = relocate_one_extent(root, path, &key, block_group,
+                                          reloc_inode, pass);
+                BUG_ON(ret < 0);
+                /* a positive return is counted as a skipped extent */
+                if (ret > 0)
+                        skipped++;
+                /* resume the search just past the extent that was handled */
+                key.objectid = cur_byte;
+                key.type = 0;
+                key.offset = 0;
+        }
+        btrfs_release_path(root, path);
+        if (pass == 0) {
+                btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1);
+                invalidate_mapping_pages(reloc_inode->i_mapping, 0, -1);
+        }
+        if (total_found > 0) {
+                printk(KERN_INFO "btrfs found %llu extents in pass %d\n",
+                       (unsigned long long)total_found, pass);
+                pass++;
+                /*
+                 * every extent was skipped and more than two passes have run:
+                 * start over with a fresh reloc inode
+                 * NOTE(review): unlike the first create_reloc_inode() call,
+                 * this result is not checked with IS_ERR()
+                 */
+                if (total_found == skipped && pass > 2) {
+                        iput(reloc_inode);
+                        reloc_inode = create_reloc_inode(info, block_group);
+                        pass = 0;
+                }
+                goto again;
+        }
+        /* delete reloc_inode */
+        iput(reloc_inode);
+        /* unpin extents in this range */
+        trans = btrfs_start_transaction(info->tree_root, 1);
+        btrfs_commit_transaction(trans, info->tree_root);
+        /* the group is expected to hold nothing at this point */
+        spin_lock(&block_group->lock);
+        WARN_ON(block_group->pinned > 0);
+        WARN_ON(block_group->reserved > 0);
+        WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
+        spin_unlock(&block_group->lock);
+        put_block_group(block_group);
+        ret = 0;
+out:
+        btrfs_free_path(path);
+        return ret;
+}
+/*
+ * Position 'path' on the first BTRFS_BLOCK_GROUP_ITEM_KEY item whose objectid
+ * is >= key->objectid, starting the search at 'key'.
+ *
+ * Returns 0 with 'path' pointing at the item, -ENOENT when the tree runs out
+ * of leaves first, or a negative error from the search/leaf walk.
+ */
+static int find_first_block_group(struct btrfs_root *root,
+                struct btrfs_path *path, struct btrfs_key *key)
+{
+        struct btrfs_key cur;
+        struct extent_buffer *eb;
+        int idx;
+        int err;
+        err = btrfs_search_slot(NULL, root, key, path, 0, 0);
+        if (err < 0)
+                return err;
+        for (;;) {
+                idx = path->slots[0];
+                eb = path->nodes[0];
+                if (idx >= btrfs_header_nritems(eb)) {
+                        /* ran off this leaf: step to the next one, if any */
+                        err = btrfs_next_leaf(root, path);
+                        if (err < 0)
+                                return err;
+                        if (err != 0)
+                                return -ENOENT;
+                        continue;
+                }
+                btrfs_item_key_to_cpu(eb, &cur, idx);
+                if (cur.objectid >= key->objectid &&
+                    cur.type == BTRFS_BLOCK_GROUP_ITEM_KEY)
+                        return 0;
+                path->slots[0]++;
+        }
+}
+/*
+ * Tear down every cached block group of 'info'.
+ *
+ * Entries are taken from the tail of the block group rbtree one at a time.
+ * The cache spinlock is dropped while a single entry is dismantled, because
+ * that work takes the space_info rw-semaphore, and is re-taken before the
+ * tree is looked at again.  Always returns 0.
+ */
+int btrfs_free_block_groups(struct btrfs_fs_info *info)
+{
+        struct btrfs_block_group_cache *cache;
+        struct rb_node *last;
+        spin_lock(&info->block_group_cache_lock);
+        for (;;) {
+                last = rb_last(&info->block_group_cache_tree);
+                if (last == NULL)
+                        break;
+                cache = rb_entry(last, struct btrfs_block_group_cache,
+                                 cache_node);
+                rb_erase(&cache->cache_node, &info->block_group_cache_tree);
+                spin_unlock(&info->block_group_cache_lock);
+                btrfs_remove_free_space_cache(cache);
+                /* unlink from the owning space_info's list of groups */
+                down_write(&cache->space_info->groups_sem);
+                list_del(&cache->list);
+                up_write(&cache->space_info->groups_sem);
+                /* only the initial reference is expected to remain */
+                WARN_ON(atomic_read(&cache->count) != 1);
+                kfree(cache);
+                spin_lock(&info->block_group_cache_lock);
+        }
+        spin_unlock(&info->block_group_cache_lock);
+        return 0;
+}
+/*
+ * Build the in-memory block group cache from the extent tree.
+ *
+ * Every BTRFS_BLOCK_GROUP_ITEM_KEY item located by find_first_block_group()
+ * gets a btrfs_block_group_cache that is accounted in its space_info, linked
+ * on that space_info's list and inserted into the block group cache tree.
+ *
+ * Returns -ENOMEM if the path or a cache entry cannot be allocated; otherwise
+ * returns the non-zero value with which find_first_block_group() ended the
+ * walk (a positive value is mapped to 0).
+ *
+ * NOTE(review): find_first_block_group() reports "no more block groups" as
+ * -ENOENT, so that value is what a complete walk returns here -- confirm
+ * callers treat it as success.
+ */
+int btrfs_read_block_groups(struct btrfs_root *root)
+{
+        struct btrfs_path *path;
+        int ret;
+        struct btrfs_block_group_cache *cache;
+        struct btrfs_fs_info *info = root->fs_info;
+        struct btrfs_space_info *space_info;
+        struct btrfs_key key;
+        struct btrfs_key found_key;
+        struct extent_buffer *leaf;
+        /* block group items live in the extent tree */
+        root = info->extent_root;
+        key.objectid = 0;
+        key.offset = 0;
+        btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+        while (1) {
+                ret = find_first_block_group(root, path, &key);
+                if (ret > 0) {
+                        ret = 0;
+                        goto error;
+                }
+                if (ret != 0)
+                        goto error;
+                leaf = path->nodes[0];
+                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+                cache = kzalloc(sizeof(*cache), GFP_NOFS);
+                if (!cache) {
+                        /*
+                         * leave through the error label so -ENOMEM reaches
+                         * the caller instead of being reset to 0
+                         */
+                        ret = -ENOMEM;
+                        goto error;
+                }
+                atomic_set(&cache->count, 1);
+                spin_lock_init(&cache->lock);
+                mutex_init(&cache->alloc_mutex);
+                mutex_init(&cache->cache_mutex);
+                INIT_LIST_HEAD(&cache->list);
+                read_extent_buffer(leaf, &cache->item,
+                                   btrfs_item_ptr_offset(leaf, path->slots[0]),
+                                   sizeof(cache->item));
+                memcpy(&cache->key, &found_key, sizeof(found_key));
+                /* the next search starts right after this block group */
+                key.objectid = found_key.objectid + found_key.offset;
+                btrfs_release_path(root, path);
+                cache->flags = btrfs_block_group_flags(&cache->item);
+                ret = update_space_info(info, cache->flags, found_key.offset,
+                                        btrfs_block_group_used(&cache->item),
+                                        &space_info);
+                BUG_ON(ret);
+                cache->space_info = space_info;
+                down_write(&space_info->groups_sem);
+                list_add_tail(&cache->list, &space_info->block_groups);
+                up_write(&space_info->groups_sem);
+                ret = btrfs_add_block_group_cache(root->fs_info, cache);
+                BUG_ON(ret);
+                set_avail_alloc_bits(root->fs_info, cache->flags);
+                if (btrfs_chunk_readonly(root, cache->key.objectid))
+                        set_block_group_readonly(cache);
+        }
+error:
+        btrfs_free_path(path);
+        return ret;
+}
+/*
+ * Create a new block group covering [chunk_offset, chunk_offset + size).
+ *
+ * An in-memory btrfs_block_group_cache is allocated and initialised, accounted
+ * in its space_info, linked on that space_info's list and added to the block
+ * group cache tree; the block group item is then inserted into the extent
+ * tree under 'trans'.
+ *
+ * Returns 0 on success or -ENOMEM if the cache entry cannot be allocated.
+ * Any later failure trips a BUG_ON().
+ */
+int btrfs_make_block_group(struct btrfs_trans_handle *trans,
+                           struct btrfs_root *root, u64 bytes_used,
+                           u64 type, u64 chunk_objectid, u64 chunk_offset,
+                           u64 size)
+{
+        int ret;
+        struct btrfs_root *extent_root;
+        struct btrfs_block_group_cache *cache;
+        extent_root = root->fs_info->extent_root;
+        /* record the transaction that created the newest block group */
+        root->fs_info->last_trans_new_blockgroup = trans->transid;
+        cache = kzalloc(sizeof(*cache), GFP_NOFS);
+        if (!cache)
+                return -ENOMEM;
+        /* key: objectid = start of the group, offset = its length */
+        cache->key.objectid = chunk_offset;
+        cache->key.offset = size;
+        cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+        atomic_set(&cache->count, 1);
+        spin_lock_init(&cache->lock);
+        mutex_init(&cache->alloc_mutex);
+        mutex_init(&cache->cache_mutex);
+        INIT_LIST_HEAD(&cache->list);
+        /* fill in the item that is written to the extent tree below */
+        btrfs_set_block_group_used(&cache->item, bytes_used);
+        btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
+        cache->flags = type;
+        btrfs_set_block_group_flags(&cache->item, type);
+        ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
+                                &cache->space_info);
+        BUG_ON(ret);
+        down_write(&cache->space_info->groups_sem);
+        list_add_tail(&cache->list, &cache->space_info->block_groups);
+        up_write(&cache->space_info->groups_sem);
+        ret = btrfs_add_block_group_cache(root->fs_info, cache);
+        BUG_ON(ret);
+        ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
+                                sizeof(cache->item));
+        BUG_ON(ret);
+        /* NOTE(review): the result of finish_current_insert() is not checked */
+        finish_current_insert(trans, extent_root, 0);
+        ret = del_pending_extents(trans, extent_root, 0);
+        BUG_ON(ret);
+        set_avail_alloc_bits(extent_root->fs_info, type);
+        return 0;
+}
+/*
+ * Remove the block group starting at 'group_start', which must already be
+ * read-only.
+ *
+ * The in-memory cache entry is unhooked from the block group cache tree and
+ * from its space_info, the space_info byte counters are reduced by the
+ * group's size, and the block group item is deleted from the extent tree
+ * under 'trans'.
+ *
+ * Returns 0 on success, -EIO if the item is not found in the extent tree, or
+ * a negative error from the search or the delete.
+ */
+int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
+                             struct btrfs_root *root, u64 group_start)
+{
+        struct btrfs_path *path;
+        struct btrfs_block_group_cache *block_group;
+        struct btrfs_key key;
+        int ret;
+        root = root->fs_info->extent_root;
+        block_group = btrfs_lookup_block_group(root->fs_info, group_start);
+        BUG_ON(!block_group);
+        BUG_ON(!block_group->ro);
+        /* keep a private copy of the key: the references are dropped below */
+        memcpy(&key, &block_group->key, sizeof(key));
+        path = btrfs_alloc_path();
+        BUG_ON(!path);
+        btrfs_remove_free_space_cache(block_group);
+        /*
+         * the cache tree is modified under block_group_cache_lock, matching
+         * btrfs_free_block_groups()
+         */
+        spin_lock(&root->fs_info->block_group_cache_lock);
+        rb_erase(&block_group->cache_node,
+                 &root->fs_info->block_group_cache_tree);
+        spin_unlock(&root->fs_info->block_group_cache_lock);
+        down_write(&block_group->space_info->groups_sem);
+        list_del(&block_group->list);
+        up_write(&block_group->space_info->groups_sem);
+        spin_lock(&block_group->space_info->lock);
+        block_group->space_info->total_bytes -= block_group->key.offset;
+        block_group->space_info->bytes_readonly -= block_group->key.offset;
+        spin_unlock(&block_group->space_info->lock);
+        block_group->space_info->full = 0;
+        /*
+         * two references are dropped here
+         * NOTE(review): presumably the one taken by the lookup above and the
+         * cache tree's own -- confirm
+         */
+        put_block_group(block_group);
+        put_block_group(block_group);
+        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+        /* a positive result means the exact key was not found */
+        if (ret > 0)
+                ret = -EIO;
+        if (ret < 0)
+                goto out;
+        ret = btrfs_del_item(trans, root, path);
+out:
+        btrfs_free_path(path);
+        return ret;
+}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
new file mode 100644
index 000000000000..e086d407f1fa
--- /dev/null
+++ b/fs/btrfs/extent_io.c
@@ -0,0 +1,3717 @@
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include <linux/bio.h>
+#include <linux/mm.h>
+#include <linux/gfp.h>
+#include <linux/pagemap.h>
+#include <linux/page-flags.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/blkdev.h>
+#include <linux/swap.h>
+#include <linux/version.h>
+#include <linux/writeback.h>
+#include <linux/pagevec.h>
+#include "extent_io.h"
+#include "extent_map.h"
+#include "compat.h"
+#include "ctree.h"
+#include "btrfs_inode.h"
+/* temporary declaration until extent_map moves out of btrfs */
+struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
+                                       unsigned long extra_flags,
+                                       void (*ctor)(void *, struct kmem_cache *,
+                                                    unsigned long));
+/* slab caches for struct extent_state and struct extent_buffer */
+static struct kmem_cache *extent_state_cache;
+static struct kmem_cache *extent_buffer_cache;
+/* leak-tracking lists, reported and drained by extent_io_exit() */
+static LIST_HEAD(buffers);
+static LIST_HEAD(states);
+/*
+ * NOTE(review): LEAK_DEBUG is defined as 0 but tested with #ifdef, which is
+ * true for any defined macro, so the leak tracking code is always compiled in.
+ */
+#define LEAK_DEBUG 0
+#ifdef LEAK_DEBUG
+static DEFINE_SPINLOCK(leak_lock);
+#endif
+#define BUFFER_LRU_MAX 64
+/*
+ * Generic view of a node in the state rbtree: an inclusive [start, end] byte
+ * range plus the rb_node linking it into the tree.  tree_insert() and
+ * __etree_search() read nodes through this type.
+ * NOTE(review): those nodes are embedded in struct extent_state, so this
+ * layout presumably has to match the start of that struct -- confirm.
+ */
+struct tree_entry {
+        u64 start;
+        u64 end;
+        struct rb_node rb_node;
+};
+/*
+ * Bundle of a bio, an extent_io_tree and a get_extent callback.
+ * NOTE(review): presumably the context passed along the page writeback
+ * paths -- its users are outside this part of the file.
+ */
+struct extent_page_data {
+        struct bio *bio;
+        struct extent_io_tree *tree;
+        get_extent_t *get_extent;
+        /* tells writepage not to lock the state bits for this range;
+         * it still does the unlocking
+         */
+        int extent_locked;
+};
+/*
+ * Create the slab caches for extent_state and extent_buffer objects.
+ * Returns 0 on success or -ENOMEM; if the second cache cannot be created the
+ * first one is destroyed again.
+ */
+int __init extent_io_init(void)
+{
+        extent_state_cache = btrfs_cache_create("extent_state",
+                                            sizeof(struct extent_state), 0,
+                                            NULL);
+        if (extent_state_cache == NULL)
+                return -ENOMEM;
+        extent_buffer_cache = btrfs_cache_create("extent_buffers",
+                                            sizeof(struct extent_buffer), 0,
+                                            NULL);
+        if (extent_buffer_cache != NULL)
+                return 0;
+        /* second cache failed: undo the first */
+        kmem_cache_destroy(extent_state_cache);
+        return -ENOMEM;
+}
+/*
+ * Module teardown: report every extent_state and extent_buffer still sitting
+ * on the leak lists, free them, then destroy both slab caches.
+ */
+void extent_io_exit(void)
+{
+        struct extent_state *leaked_state;
+        struct extent_buffer *leaked_eb;
+        /* leftover extent_state objects */
+        for (;;) {
+                if (list_empty(&states))
+                        break;
+                leaked_state = list_entry(states.next, struct extent_state,
+                                          leak_list);
+                printk(KERN_ERR "btrfs state leak: start %llu end %llu "
+                       "state %lu in tree %p refs %d\n",
+                       (unsigned long long)leaked_state->start,
+                       (unsigned long long)leaked_state->end,
+                       leaked_state->state, leaked_state->tree,
+                       atomic_read(&leaked_state->refs));
+                list_del(&leaked_state->leak_list);
+                kmem_cache_free(extent_state_cache, leaked_state);
+        }
+        /* leftover extent_buffer objects */
+        for (;;) {
+                if (list_empty(&buffers))
+                        break;
+                leaked_eb = list_entry(buffers.next, struct extent_buffer,
+                                       leak_list);
+                printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
+                       "refs %d\n", (unsigned long long)leaked_eb->start,
+                       leaked_eb->len, atomic_read(&leaked_eb->refs));
+                list_del(&leaked_eb->leak_list);
+                kmem_cache_free(extent_buffer_cache, leaked_eb);
+        }
+        if (extent_state_cache != NULL)
+                kmem_cache_destroy(extent_state_cache);
+        if (extent_buffer_cache != NULL)
+                kmem_cache_destroy(extent_buffer_cache);
+}
+/*
+ * Initialise an empty extent_io_tree bound to 'mapping': both rbtrees empty,
+ * no ops, zero dirty bytes, both spinlocks ready.  'mask' is not used.
+ */
+void extent_io_tree_init(struct extent_io_tree *tree,
+                          struct address_space *mapping, gfp_t mask)
+{
+        spin_lock_init(&tree->lock);
+        spin_lock_init(&tree->buffer_lock);
+        tree->mapping = mapping;
+        tree->ops = NULL;
+        tree->dirty_bytes = 0;
+        tree->state.rb_node = NULL;
+        tree->buffer.rb_node = NULL;
+}
+/*
+ * Allocate an extent_state from the slab cache using 'mask'.  The new object
+ * has no bits set, belongs to no tree and holds one reference; start and end
+ * are left for the caller to fill in.  Returns NULL on allocation failure.
+ */
+static struct extent_state *alloc_extent_state(gfp_t mask)
+{
+        struct extent_state *es;
+#ifdef LEAK_DEBUG
+        unsigned long irq_flags;
+#endif
+        es = kmem_cache_alloc(extent_state_cache, mask);
+        if (es == NULL)
+                return NULL;
+        es->tree = NULL;
+        es->private = 0;
+        es->state = 0;
+#ifdef LEAK_DEBUG
+        /* track the object so extent_io_exit() can report it if leaked */
+        spin_lock_irqsave(&leak_lock, irq_flags);
+        list_add(&es->leak_list, &states);
+        spin_unlock_irqrestore(&leak_lock, irq_flags);
+#endif
+        atomic_set(&es->refs, 1);
+        init_waitqueue_head(&es->wq);
+        return es;
+}
+/*
+ * Drop one reference on 'state' (NULL is accepted and ignored).  When the
+ * last reference goes away the object is taken off the leak list and handed
+ * back to the slab cache; it is expected to be out of any tree by then.
+ */
+static void free_extent_state(struct extent_state *state)
+{
+#ifdef LEAK_DEBUG
+        unsigned long irq_flags;
+#endif
+        if (state == NULL)
+                return;
+        if (!atomic_dec_and_test(&state->refs))
+                return;
+        WARN_ON(state->tree);
+#ifdef LEAK_DEBUG
+        spin_lock_irqsave(&leak_lock, irq_flags);
+        list_del(&state->leak_list);
+        spin_unlock_irqrestore(&leak_lock, irq_flags);
+#endif
+        kmem_cache_free(extent_state_cache, state);
+}
+/*
+ * Link 'node' into the range rbtree 'root', keyed by 'offset'.
+ *
+ * If an existing entry's [start, end] range already contains 'offset', nothing
+ * is inserted and that entry's rb_node is returned.  Otherwise the node is
+ * linked and rebalanced in, and NULL is returned.
+ */
+static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
+                                   struct rb_node *node)
+{
+        struct rb_node **link = &root->rb_node;
+        struct rb_node *parent = NULL;
+        struct tree_entry *cur;
+        while (*link != NULL) {
+                parent = *link;
+                cur = rb_entry(parent, struct tree_entry, rb_node);
+                if (offset < cur->start) {
+                        link = &parent->rb_left;
+                        continue;
+                }
+                if (offset > cur->end) {
+                        link = &parent->rb_right;
+                        continue;
+                }
+                /* offset falls inside an existing entry */
+                return parent;
+        }
+        rb_link_node(node, parent, link);
+        rb_insert_color(node, root);
+        return NULL;
+}
+/*
+ * Search the state tree for the entry whose [start, end] range contains
+ * 'offset'.
+ *
+ * Returns that entry's rb_node, or NULL when no entry contains 'offset'.  In
+ * the NULL case only, the optional out-parameters are filled in:
+ *   *prev_ret - first entry, walking forward from the last node visited,
+ *               whose end is >= offset (NULL if there is none)
+ *   *next_ret - first entry, walking backward from the last node visited,
+ *               whose start is <= offset (NULL if there is none)
+ * Note that despite its name, *prev_ret therefore refers to an entry at or
+ * after 'offset', and *next_ret to one at or before it.
+ */
+static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
+                                     struct rb_node **prev_ret,
+                                     struct rb_node **next_ret)
+{
+        struct rb_root *root = &tree->state;
+        struct rb_node *n = root->rb_node;
+        struct rb_node *prev = NULL;
+        struct rb_node *orig_prev = NULL;
+        struct tree_entry *entry;
+        struct tree_entry *prev_entry = NULL;
+        while (n) {
+                entry = rb_entry(n, struct tree_entry, rb_node);
+                /* remember the last node visited on the way down */
+                prev = n;
+                prev_entry = entry;
+                if (offset < entry->start)
+                        n = n->rb_left;
+                else if (offset > entry->end)
+                        n = n->rb_right;
+                else
+                        return n;
+        }
+        if (prev_ret) {
+                orig_prev = prev;
+                /*
+                 * prev_entry may be computed from a NULL prev here; it is
+                 * only dereferenced after the 'prev &&' check passes
+                 */
+                while (prev && offset > prev_entry->end) {
+                        prev = rb_next(prev);
+                        prev_entry = rb_entry(prev, struct tree_entry, rb_node);
+                }
+                *prev_ret = prev;
+                /* restart the backward walk from the last node visited */
+                prev = orig_prev;
+        }
+        if (next_ret) {
+                prev_entry = rb_entry(prev, struct tree_entry, rb_node);
+                while (prev && offset < prev_entry->start) {
+                        prev = rb_prev(prev);
+                        prev_entry = rb_entry(prev, struct tree_entry, rb_node);
+                }
+                *next_ret = prev;
+        }
+        return NULL;
+}
+/*
+ * Return the state tree node containing 'offset'; failing that, whatever
+ * __etree_search() reports through its prev_ret argument (possibly NULL).
+ */
+static inline struct rb_node *tree_search(struct extent_io_tree *tree,
+                                          u64 offset)
+{
+        struct rb_node *fallback = NULL;
+        struct rb_node *hit = __etree_search(tree, offset, &fallback, NULL);
+        return hit ? hit : fallback;
+}
+/*
+ * Link 'node' into the tree's extent_buffer rbtree, keyed by 'offset'.
+ * If a buffer with that exact start already exists it is returned and nothing
+ * is inserted; otherwise the node is linked in and NULL is returned.
+ */
+static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree,
+                                          u64 offset, struct rb_node *node)
+{
+        struct rb_root *buffers_root = &tree->buffer;
+        struct rb_node **link = &buffers_root->rb_node;
+        struct rb_node *parent = NULL;
+        struct extent_buffer *cur;
+        while (*link != NULL) {
+                parent = *link;
+                cur = rb_entry(parent, struct extent_buffer, rb_node);
+                if (offset == cur->start)
+                        return cur;
+                if (offset < cur->start)
+                        link = &parent->rb_left;
+                else
+                        link = &parent->rb_right;
+        }
+        rb_link_node(node, parent, link);
+        rb_insert_color(node, buffers_root);
+        return NULL;
+}
+/*
+ * Look up the extent_buffer whose start equals 'offset' in the tree's buffer
+ * rbtree.  Returns the buffer, or NULL if there is no exact match.
+ */
+static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
+                                           u64 offset)
+{
+        struct rb_node *cursor = tree->buffer.rb_node;
+        struct extent_buffer *cur;
+        while (cursor != NULL) {
+                cur = rb_entry(cursor, struct extent_buffer, rb_node);
+                if (offset == cur->start)
+                        return cur;
+                cursor = (offset < cur->start) ? cursor->rb_left
+                                               : cursor->rb_right;
+        }
+        return NULL;
+}
+/*
+ * utility function to look for merge candidates next to a given state.
+ * A neighbour that is byte-adjacent and has identical state bits is merged
+ * with 'state' into a single extent in the tree.  States with any of
+ * EXTENT_IOBITS or EXTENT_BOUNDARY set are not merged; for the IO bits this
+ * is because the end_io handlers need to be able to do operations on them
+ * without sleeping (or doing allocations/splits).
+ *
+ * This should be called with the tree lock held.
+ *
+ * Always returns 0.  When 'state' is merged into its successor, 'state'
+ * itself is removed from the tree and its reference dropped, so callers must
+ * not touch it after this returns.
+ */
+static int merge_state(struct extent_io_tree *tree,
+                       struct extent_state *state)
+{
+        struct extent_state *other;
+        struct rb_node *other_node;
+        if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
+                return 0;
+        /* absorb the previous extent into 'state' */
+        other_node = rb_prev(&state->rb_node);
+        if (other_node) {
+                other = rb_entry(other_node, struct extent_state, rb_node);
+                if (other->end == state->start - 1 &&
+                    other->state == state->state) {
+                        state->start = other->start;
+                        other->tree = NULL;
+                        rb_erase(&other->rb_node, &tree->state);
+                        free_extent_state(other);
+                }
+        }
+        /* let the next extent absorb 'state' */
+        other_node = rb_next(&state->rb_node);
+        if (other_node) {
+                other = rb_entry(other_node, struct extent_state, rb_node);
+                if (other->start == state->end + 1 &&
+                    other->state == state->state) {
+                        other->start = state->start;
+                        state->tree = NULL;
+                        rb_erase(&state->rb_node, &tree->state);
+                        free_extent_state(state);
+                }
+        }
+        return 0;
+}
+/*
+ * Invoke the tree's optional set_bit_hook with the state's range, its current
+ * bits and the bits about to be set.  Does nothing when no hook is installed.
+ */
+static void set_state_cb(struct extent_io_tree *tree,
+                         struct extent_state *state,
+                         unsigned long bits)
+{
+        if (!tree->ops || !tree->ops->set_bit_hook)
+                return;
+        tree->ops->set_bit_hook(tree->mapping->host, state->start,
+                                state->end, state->state, bits);
+}
+/*
+ * Invoke the tree's optional clear_bit_hook with the state's range, its
+ * current bits and the bits about to be cleared.  Does nothing when no hook
+ * is installed.
+ */
+static void clear_state_cb(struct extent_io_tree *tree,
+                           struct extent_state *state,
+                           unsigned long bits)
+{
+        if (!tree->ops || !tree->ops->clear_bit_hook)
+                return;
+        tree->ops->clear_bit_hook(tree->mapping->host, state->start,
+                                  state->end, state->state, bits);
+}
+/*
+ * insert an extent_state struct into the tree.  'bits' are set on the
+ * struct before it is inserted, and [start, end] (inclusive) becomes its
+ * range.
+ *
+ * Returns 0 on success.  This may return -EEXIST if the tree already has an
+ * entry containing 'end', in which case the caller's reference on the state
+ * struct is dropped.
+ *
+ * On success the state may have been merged with a neighbour by
+ * merge_state(), so the caller must not rely on it afterwards.
+ *
+ * The tree lock is not taken internally.  This is a utility function and
+ * probably isn't what you want to call (see set/clear_extent_bit).
+ */
+static int insert_state(struct extent_io_tree *tree,
+                        struct extent_state *state, u64 start, u64 end,
+                        int bits)
+{
+        struct rb_node *node;
+        if (end < start) {
+                printk(KERN_ERR "btrfs end < start %llu %llu\n",
+                       (unsigned long long)end,
+                       (unsigned long long)start);
+                WARN_ON(1);
+        }
+        /*
+         * fill in the range before running the hook: set_state_cb() hands
+         * state->start and state->end to set_bit_hook, and a freshly
+         * allocated state has neither field initialised
+         */
+        state->start = start;
+        state->end = end;
+        if (bits & EXTENT_DIRTY)
+                tree->dirty_bytes += end - start + 1;
+        /* the hook must still see the old bits, so update them afterwards */
+        set_state_cb(tree, state, bits);
+        state->state |= bits;
+        node = tree_insert(&tree->state, end, &state->rb_node);
+        if (node) {
+                struct extent_state *found;
+                found = rb_entry(node, struct extent_state, rb_node);
+                printk(KERN_ERR "btrfs found node %llu %llu on insert of "
+                       "%llu %llu\n", (unsigned long long)found->start,
+                       (unsigned long long)found->end,
+                       (unsigned long long)start, (unsigned long long)end);
+                free_extent_state(state);
+                return -EEXIST;
+        }
+        state->tree = tree;
+        merge_state(tree, state);
+        return 0;
+}
+/*
+ * Cut the extent state 'orig' in two at byte offset 'split', using the
+ * caller-supplied 'prealloc' struct for the new first half.
+ *
+ * Going in, the tree holds 'orig' covering [orig->start, orig->end].
+ * Coming out, it holds two structs carrying the same state bits:
+ *   prealloc: [old orig->start, split - 1]
+ *   orig:     [split, orig->end]
+ *
+ * Returns 0 on success.  If the tree already has an entry containing
+ * split - 1, the reference on 'prealloc' is dropped and -EEXIST is returned.
+ *
+ * No locking is done here; the caller must hold the tree lock.
+ */
+static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
+                       struct extent_state *prealloc, u64 split)
+{
+        /* first half inherits the bits and the original start */
+        prealloc->state = orig->state;
+        prealloc->start = orig->start;
+        prealloc->end = split - 1;
+        /* 'orig' shrinks to the second half */
+        orig->start = split;
+        if (tree_insert(&tree->state, prealloc->end, &prealloc->rb_node)) {
+                free_extent_state(prealloc);
+                return -EEXIST;
+        }
+        prealloc->tree = tree;
+        return 0;
+}
+/*
+ * utility function to clear some bits in an extent state struct.
+ * it will optionally wake up anyone waiting on this state (wake == 1), or
+ * forcibly remove the state from the tree (delete == 1).
+ *
+ * If no bits are set on the state struct after clearing things, the
+ * struct is removed from the tree and its reference is dropped.
+ *
+ * Returns the subset of 'bits' that was set on the state before clearing.
+ */
+static int clear_state_bit(struct extent_io_tree *tree,
+                            struct extent_state *state, int bits, int wake,
+                            int delete)
+{
+        int ret = state->state & bits;
+        /* keep the tree-wide dirty byte count in step */
+        if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
+                u64 range = state->end - state->start + 1;
+                WARN_ON(range > tree->dirty_bytes);
+                tree->dirty_bytes -= range;
+        }
+        clear_state_cb(tree, state, bits);
+        state->state &= ~bits;
+        if (wake)
+                wake_up(&state->wq);
+        if (delete || state->state == 0) {
+                if (state->tree) {
+                        /* run the hook once more for the bits still set */
+                        clear_state_cb(tree, state, state->state);
+                        rb_erase(&state->rb_node, &tree->state);
+                        state->tree = NULL;
+                        free_extent_state(state);
+                } else {
+                        /* the state was expected to be in a tree */
+                        WARN_ON(1);
+                }
+        } else {
+                /* bits remain: see if a neighbour now has matching bits */
+                merge_state(tree, state);
+        }
+        return ret;
+}
+/*
+ * clear some bits on a range in the tree.  This may require splitting
+ * or inserting elements in the tree, so the gfp mask is used to
+ * indicate which allocations or sleeping are allowed.
+ *
+ * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
+ * the given range from the tree regardless of state (ie for truncate).
+ *
+ * the range [start, end] is inclusive.
+ *
+ * This takes the tree lock, and returns < 0 on error, > 0 if any of the
+ * bits were already set, or zero if none of the bits were already set.
+ * The only error produced here is -ENOMEM, when the up-front preallocation
+ * fails; an error from split_state() ends the walk but is not returned.
+ *
+ * NOTE(review): the alloc_extent_state(GFP_ATOMIC) fallbacks below are not
+ * checked for NULL before split_state() writes through 'prealloc'.
+ */
+int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+                     int bits, int wake, int delete, gfp_t mask)
+{
+        struct extent_state *state;
+        struct extent_state *prealloc = NULL;
+        struct rb_node *node;
+        int err;
+        int set = 0;
+again:
+        /* preallocate outside the spinlock when the mask allows sleeping */
+        if (!prealloc && (mask & __GFP_WAIT)) {
+                prealloc = alloc_extent_state(mask);
+                if (!prealloc)
+                        return -ENOMEM;
+        }
+        spin_lock(&tree->lock);
+        /*
+         * this search will find the extents that end after
+         * our range starts
+         */
+        node = tree_search(tree, start);
+        if (!node)
+                goto out;
+        state = rb_entry(node, struct extent_state, rb_node);
+        if (state->start > end)
+                goto out;
+        WARN_ON(state->end < start);
+        /*
+         *     | ---- desired range ---- |
+         *  | state | or
+         *  | ------------- state -------------- |
+         *
+         * We need to split the extent we found, and may flip
+         * bits on second half.
+         *
+         * If the extent we found extends past our range, we
+         * just split and search again.  It'll get split again
+         * the next time through.
+         *
+         * If the extent we found is inside our range, we clear
+         * the desired bit on it.
+         */
+        if (state->start < start) {
+                if (!prealloc)
+                        prealloc = alloc_extent_state(GFP_ATOMIC);
+                err = split_state(tree, state, prealloc, start);
+                BUG_ON(err == -EEXIST);
+                /* prealloc now belongs to the tree (or was released) */
+                prealloc = NULL;
+                if (err)
+                        goto out;
+                if (state->end <= end) {
+                        start = state->end + 1;
+                        set |= clear_state_bit(tree, state, bits,
+                                        wake, delete);
+                } else {
+                        start = state->start;
+                }
+                goto search_again;
+        }
+        /*
+         * | ---- desired range ---- |
+         *                        | state |
+         * We need to split the extent, and clear the bit
+         * on the first half
+         */
+        if (state->start <= end && state->end > end) {
+                if (!prealloc)
+                        prealloc = alloc_extent_state(GFP_ATOMIC);
+                err = split_state(tree, state, prealloc, end + 1);
+                BUG_ON(err == -EEXIST);
+                if (wake)
+                        wake_up(&state->wq);
+                /* after the split, 'prealloc' is the half inside the range */
+                set |= clear_state_bit(tree, prealloc, bits,
+                                       wake, delete);
+                prealloc = NULL;
+                goto out;
+        }
+        /* the state lies entirely inside the range */
+        start = state->end + 1;
+        set |= clear_state_bit(tree, state, bits, wake, delete);
+        goto search_again;
+out:
+        spin_unlock(&tree->lock);
+        if (prealloc)
+                free_extent_state(prealloc);
+        return set;
+search_again:
+        if (start > end)
+                goto out;
+        /* drop the lock between rounds so sleeping callers can reschedule */
+        spin_unlock(&tree->lock);
+        if (mask & __GFP_WAIT)
+                cond_resched();
+        goto again;
+}
+/*
+ * Sleep on the wait queue of 'state' until woken.
+ *
+ * Called with tree->lock held; the lock is released while sleeping and
+ * re-taken before returning.  The task is queued before the lock is dropped.
+ * Always returns 0.
+ */
+static int wait_on_state(struct extent_io_tree *tree,
+                         struct extent_state *state)
+                __releases(tree->lock)
+                __acquires(tree->lock)
+{
+        DEFINE_WAIT(wait);
+        prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
+        spin_unlock(&tree->lock);
+        schedule();
+        spin_lock(&tree->lock);
+        finish_wait(&state->wq, &wait);
+        return 0;
+}
+/*
+ * Block until none of 'bits' is set on any extent state overlapping the
+ * inclusive range [start, end].
+ *
+ * tree->lock is taken here; it is dropped while sleeping on a state and
+ * when a reschedule is pending.  Always returns 0.
+ *
+ * NOTE(review): 'start = state->end + 1' wraps to 0 for a state ending at
+ * (u64)-1 -- confirm callers cannot hit that case.
+ */
+int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
+{
+        struct rb_node *found;
+        struct extent_state *cur;
+
+        spin_lock(&tree->lock);
+        for (;;) {
+                /* first extent that ends after the current search offset */
+                found = tree_search(tree, start);
+                if (!found)
+                        break;
+
+                cur = rb_entry(found, struct extent_state, rb_node);
+                if (cur->start > end)
+                        break;
+
+                if (cur->state & bits) {
+                        /*
+                         * Pin the state across the sleep, then search
+                         * again from its start since the tree may have
+                         * changed while the lock was dropped.
+                         */
+                        start = cur->start;
+                        atomic_inc(&cur->refs);
+                        wait_on_state(tree, cur);
+                        free_extent_state(cur);
+                        continue;
+                }
+
+                start = cur->end + 1;
+                if (start > end)
+                        break;
+
+                if (need_resched()) {
+                        spin_unlock(&tree->lock);
+                        cond_resched();
+                        spin_lock(&tree->lock);
+                }
+        }
+        spin_unlock(&tree->lock);
+        return 0;
+}
+/*
+ * OR 'bits' into state->state.  If EXTENT_DIRTY is being newly set, the
+ * length of the state is added to tree->dirty_bytes first.  set_state_cb()
+ * is invoked before the bits are actually changed.
+ */
+static void set_state_bits(struct extent_io_tree *tree,
+                           struct extent_state *state,
+                           int bits)
+{
+        int becomes_dirty = (bits & EXTENT_DIRTY) &&
+                            !(state->state & EXTENT_DIRTY);
+
+        if (becomes_dirty)
+                tree->dirty_bytes += state->end - state->start + 1;
+
+        set_state_cb(tree, state, bits);
+        state->state |= bits;
+}
+/*
+ * Make sure *prealloc points at an extent_state before it is handed to
+ * insert_state()/split_state().  Callers whose gfp mask lacks __GFP_WAIT
+ * get here without a preallocation, so fall back to an atomic allocation
+ * (tree->lock is held), the same way clear_extent_bit() does.
+ *
+ * Returns 0 on success, -ENOMEM if the allocation failed.
+ */
+static int ensure_prealloc_state(struct extent_state **prealloc)
+{
+        if (!*prealloc)
+                *prealloc = alloc_extent_state(GFP_ATOMIC);
+        return *prealloc ? 0 : -ENOMEM;
+}
+/*
+ * set some bits on a range in the tree.  This may require allocations
+ * or sleeping, so the gfp mask is used to indicate what is allowed.
+ *
+ * If 'exclusive' == 1, this will fail with -EEXIST if some part of the
+ * range already has the desired bits set.  The start of the existing
+ * range is returned in failed_start in this case.
+ *
+ * Returns 0 on success, -EEXIST as described above, -ENOMEM if an
+ * extent_state could not be allocated, or the error reported by
+ * insert_state()/split_state().  On failure, bits already set on the
+ * leading part of the range are left in place.
+ *
+ * [start, end] is inclusive
+ * This takes the tree lock.
+ */
+static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+                          int bits, int exclusive, u64 *failed_start,
+                          gfp_t mask)
+{
+        struct extent_state *state;
+        struct extent_state *prealloc = NULL;
+        struct rb_node *node;
+        int err = 0;
+        int set;
+        u64 last_start;
+        u64 last_end;
+again:
+        /* sleeping allocations must happen before the spinlock is taken */
+        if (!prealloc && (mask & __GFP_WAIT)) {
+                prealloc = alloc_extent_state(mask);
+                if (!prealloc)
+                        return -ENOMEM;
+        }
+        spin_lock(&tree->lock);
+        /*
+         * this search will find all the extents that end after
+         * our range starts.
+         */
+        node = tree_search(tree, start);
+        if (!node) {
+                err = ensure_prealloc_state(&prealloc);
+                if (err)
+                        goto out;
+                err = insert_state(tree, prealloc, start, end, bits);
+                prealloc = NULL;
+                BUG_ON(err == -EEXIST);
+                goto out;
+        }
+        state = rb_entry(node, struct extent_state, rb_node);
+        last_start = state->start;
+        last_end = state->end;
+        /*
+         * | ---- desired range ---- |
+         * | state |
+         *
+         * Just lock what we found and keep going
+         */
+        if (state->start == start && state->end <= end) {
+                set = state->state & bits;
+                if (set && exclusive) {
+                        *failed_start = state->start;
+                        err = -EEXIST;
+                        goto out;
+                }
+                set_state_bits(tree, state, bits);
+                merge_state(tree, state);
+                /*
+                 * a state ending at (u64)-1 finishes the range; adding 1
+                 * would wrap start to 0 and restart from the beginning
+                 */
+                if (last_end == (u64)-1)
+                        goto out;
+                start = last_end + 1;
+                goto search_again;
+        }
+        /*
+         *     | ---- desired range ---- |
+         * | state |
+         *   or
+         * | ------------- state -------------- |
+         *
+         * We need to split the extent we found, and may flip bits on
+         * second half.
+         *
+         * If the extent we found extends past our
+         * range, we just split and search again.  It'll get split
+         * again the next time though.
+         *
+         * If the extent we found is inside our range, we set the
+         * desired bit on it.
+         */
+        if (state->start < start) {
+                set = state->state & bits;
+                if (exclusive && set) {
+                        *failed_start = start;
+                        err = -EEXIST;
+                        goto out;
+                }
+                err = ensure_prealloc_state(&prealloc);
+                if (err)
+                        goto out;
+                err = split_state(tree, state, prealloc, start);
+                BUG_ON(err == -EEXIST);
+                prealloc = NULL;
+                if (err)
+                        goto out;
+                if (state->end <= end) {
+                        /* re-read after the split, before any merge */
+                        last_end = state->end;
+                        set_state_bits(tree, state, bits);
+                        merge_state(tree, state);
+                        /* same (u64)-1 wrap guard as above */
+                        if (last_end == (u64)-1)
+                                goto out;
+                        start = last_end + 1;
+                } else {
+                        start = state->start;
+                }
+                goto search_again;
+        }
+        /*
+         * | ---- desired range ---- |
+         *     | state | or               | state |
+         *
+         * There's a hole, we need to insert something in it and
+         * ignore the extent we found.
+         */
+        if (state->start > start) {
+                u64 this_end;
+                if (end < last_start)
+                        this_end = end;
+                else
+                        this_end = last_start - 1;
+                err = ensure_prealloc_state(&prealloc);
+                if (err)
+                        goto out;
+                err = insert_state(tree, prealloc, start, this_end,
+                                   bits);
+                prealloc = NULL;
+                BUG_ON(err == -EEXIST);
+                if (err)
+                        goto out;
+                start = this_end + 1;
+                goto search_again;
+        }
+        /*
+         * | ---- desired range ---- |
+         *                        | state |
+         * We need to split the extent, and set the bit
+         * on the first half
+         */
+        if (state->start <= end && state->end > end) {
+                set = state->state & bits;
+                if (exclusive && set) {
+                        *failed_start = start;
+                        err = -EEXIST;
+                        goto out;
+                }
+                err = ensure_prealloc_state(&prealloc);
+                if (err)
+                        goto out;
+                err = split_state(tree, state, prealloc, end + 1);
+                BUG_ON(err == -EEXIST);
+                /* after the split, prealloc is the half inside our range */
+                set_state_bits(tree, prealloc, bits);
+                merge_state(tree, prealloc);
+                prealloc = NULL;
+                goto out;
+        }
+        goto search_again;
+out:
+        spin_unlock(&tree->lock);
+        if (prealloc)
+                free_extent_state(prealloc);
+        return err;
+search_again:
+        if (start > end)
+                goto out;
+        /* drop the lock so the next pass can preallocate and reschedule */
+        spin_unlock(&tree->lock);
+        if (mask & __GFP_WAIT)
+                cond_resched();
+        goto again;
+}
+/* wrappers around set/clear extent bit */
+/*
+ * Set EXTENT_DIRTY on the inclusive range [start, end] (non-exclusive).
+ * Returns whatever set_extent_bit() returns.
+ */
+int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
+                     gfp_t mask)
+{
+        return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
+                              mask);
+}
+/*
+ * Set EXTENT_ORDERED on the inclusive range [start, end] (non-exclusive).
+ * Returns whatever set_extent_bit() returns.
+ */
+int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
+                       gfp_t mask)
+{
+        return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, mask);
+}
+/*
+ * Set caller-supplied 'bits' on the inclusive range [start, end]
+ * (non-exclusive).  Returns whatever set_extent_bit() returns.
+ */
+int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+                    int bits, gfp_t mask)
+{
+        return set_extent_bit(tree, start, end, bits, 0, NULL,
+                              mask);
+}
+/*
+ * Clear caller-supplied 'bits' on [start, end].  Both flag arguments that
+ * follow 'bits' are passed as 0 (presumably wake and delete -- confirm
+ * against clear_extent_bit()).  Returns clear_extent_bit()'s result.
+ */
+int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+                      int bits, gfp_t mask)
+{
+        return clear_extent_bit(tree, start, end, bits, 0, 0, mask);
+}
+/*
+ * Set both EXTENT_DELALLOC and EXTENT_DIRTY on the inclusive range
+ * [start, end] (non-exclusive).  Returns whatever set_extent_bit() returns.
+ */
+int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
+                     gfp_t mask)
+{
+        return set_extent_bit(tree, start, end,
+                              EXTENT_DELALLOC | EXTENT_DIRTY,
+                              0, NULL, mask);
+}
+/*
+ * Clear both EXTENT_DIRTY and EXTENT_DELALLOC on [start, end]; the two
+ * flag arguments after the bits are passed as 0.  Returns
+ * clear_extent_bit()'s result.
+ */
+int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
+                       gfp_t mask)
+{
+        return clear_extent_bit(tree, start, end,
+                                EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask);
+}
+/*
+ * Clear EXTENT_ORDERED on [start, end].  The first flag after the bits is
+ * passed as 1 (presumably 'wake', i.e. waiters on the states are woken --
+ * confirm against clear_extent_bit()), the second as 0.
+ */
+int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
+                         gfp_t mask)
+{
+        return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, mask);
+}
+/*
+ * Set EXTENT_NEW on the inclusive range [start, end] (non-exclusive).
+ * Returns whatever set_extent_bit() returns.
+ */
+int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
+                     gfp_t mask)
+{
+        return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
+                              mask);
+}
+/*
+ * Clear EXTENT_NEW on [start, end]; both flag arguments after the bits are
+ * passed as 0.  Returns clear_extent_bit()'s result.
+ */
+static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
+                       gfp_t mask)
+{
+        return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask);
+}
+/*
+ * Set EXTENT_UPTODATE on the inclusive range [start, end] (non-exclusive).
+ * Returns whatever set_extent_bit() returns.
+ */
+int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
+                        gfp_t mask)
+{
+        return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
+                              mask);
+}
+/*
+ * Clear EXTENT_UPTODATE on [start, end]; both flag arguments after the bits
+ * are passed as 0.  Returns clear_extent_bit()'s result.
+ */
+static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
+                                 u64 end, gfp_t mask)
+{
+        return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
+}
+/*
+ * Set EXTENT_WRITEBACK on the inclusive range [start, end] (non-exclusive).
+ * Returns whatever set_extent_bit() returns.
+ */
+static int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
+                         gfp_t mask)
+{
+        return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
+                              0, NULL, mask);
+}
+/*
+ * Clear EXTENT_WRITEBACK on [start, end].  The first flag after the bits is
+ * passed as 1 (presumably 'wake', so wait_on_extent_writeback() callers are
+ * woken -- confirm against clear_extent_bit()), the second as 0.
+ */
+static int clear_extent_writeback(struct extent_io_tree *tree, u64 start,
+                                  u64 end, gfp_t mask)
+{
+        return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
+}
+/*
+ * Wait until EXTENT_WRITEBACK is clear on every state overlapping
+ * [start, end].  Returns wait_extent_bit()'s result.
+ */
+int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
+{
+        return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
+}
+/*
+ * Set EXTENT_LOCKED exclusively on [start, end].
+ *
+ * When part of the range is already locked and 'mask' allows waiting
+ * (__GFP_WAIT), sleep until that part is unlocked and retry from the
+ * offset where the conflict was found.  Otherwise the result of
+ * set_extent_bit() is returned as is (including -EEXIST).
+ */
+int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
+{
+        u64 failed_start;
+        int err;
+
+        for (;;) {
+                err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
+                                     &failed_start, mask);
+                if (err != -EEXIST || !(mask & __GFP_WAIT))
+                        return err;
+
+                /* everything before failed_start is now locked by us */
+                wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
+                start = failed_start;
+                WARN_ON(start > end);
+        }
+}
+/*
+ * Try to set EXTENT_LOCKED exclusively on [start, end] without waiting for
+ * existing holders.
+ *
+ * Returns 0 if part of the range was already locked; any part locked by
+ * this call (before failed_start) is unlocked again first.  Returns 1
+ * otherwise.
+ *
+ * NOTE(review): errors other than -EEXIST from set_extent_bit() are also
+ * reported as 1 -- confirm that is intended.
+ */
+int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+                    gfp_t mask)
+{
+        u64 failed_start;
+        int err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
+                                 &failed_start, mask);
+
+        if (err != -EEXIST)
+                return 1;
+
+        if (failed_start > start)
+                clear_extent_bit(tree, start, failed_start - 1,
+                                 EXTENT_LOCKED, 1, 0, mask);
+        return 0;
+}
+/*
+ * Clear EXTENT_LOCKED on [start, end].  The first flag after the bits is
+ * passed as 1 (presumably 'wake', so tasks sleeping in lock_extent() are
+ * woken -- confirm against clear_extent_bit()), the second as 0.
+ */
+int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+                  gfp_t mask)
+{
+        return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask);
+}
+/*
+ * Mark every page cache page covering [start, end] dirty, then set
+ * EXTENT_DIRTY on the range in the tree.
+ *
+ * Every page in the range must already be in tree->mapping (BUG otherwise).
+ * The result of set_extent_dirty() is ignored; always returns 0.
+ */
+int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
+{
+        unsigned long last = end >> PAGE_CACHE_SHIFT;
+        unsigned long idx;
+        struct page *pg;
+
+        for (idx = start >> PAGE_CACHE_SHIFT; idx <= last; idx++) {
+                pg = find_get_page(tree->mapping, idx);
+                BUG_ON(!pg);
+                __set_page_dirty_nobuffers(pg);
+                /* drop the reference taken by find_get_page() */
+                page_cache_release(pg);
+        }
+        set_extent_dirty(tree, start, end, GFP_NOFS);
+        return 0;
+}
+/*
+ * Flag every page cache page covering [start, end] as under writeback,
+ * then set EXTENT_WRITEBACK on the range in the tree.
+ *
+ * Every page in the range must already be in tree->mapping (BUG otherwise).
+ * The result of set_extent_writeback() is ignored; always returns 0.
+ */
+static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
+{
+        unsigned long last = end >> PAGE_CACHE_SHIFT;
+        unsigned long idx;
+        struct page *pg;
+
+        for (idx = start >> PAGE_CACHE_SHIFT; idx <= last; idx++) {
+                pg = find_get_page(tree->mapping, idx);
+                BUG_ON(!pg);
+                set_page_writeback(pg);
+                /* drop the reference taken by find_get_page() */
+                page_cache_release(pg);
+        }
+        set_extent_writeback(tree, start, end, GFP_NOFS);
+        return 0;
+}
+/*
+ * Locate the first extent state that ends at or after 'start' and has any
+ * of 'bits' set.
+ *
+ * Returns 0 and stores the state's boundaries in *start_ret and *end_ret
+ * when one is found; returns 1 (outputs untouched) when none is.
+ * Takes tree->lock for the duration of the search.
+ */
+int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
+                          u64 *start_ret, u64 *end_ret, int bits)
+{
+        struct extent_state *cur;
+        struct rb_node *pos;
+        int ret = 1;
+
+        spin_lock(&tree->lock);
+        /* tree_search() yields the first extent ending after 'start' */
+        for (pos = tree_search(tree, start); pos; pos = rb_next(pos)) {
+                cur = rb_entry(pos, struct extent_state, rb_node);
+                if (cur->end < start || !(cur->state & bits))
+                        continue;
+
+                *start_ret = cur->start;
+                *end_ret = cur->end;
+                ret = 0;
+                break;
+        }
+        spin_unlock(&tree->lock);
+        return ret;
+}
+/*
+ * Return the first extent state that ends at or after 'start' and has any
+ * of 'bits' set, or NULL if there is none.
+ *
+ * No locking is done here: the caller must hold tree->lock.
+ */
+struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
+                                                 u64 start, int bits)
+{
+        struct extent_state *cur;
+        struct rb_node *pos;
+
+        /* tree_search() yields the first extent ending after 'start' */
+        for (pos = tree_search(tree, start); pos; pos = rb_next(pos)) {
+                cur = rb_entry(pos, struct extent_state, rb_node);
+                if (cur->end >= start && (cur->state & bits))
+                        return cur;
+        }
+        return NULL;
+}
+/*
+ * Find a run of adjacent extent states marked EXTENT_DELALLOC, beginning
+ * with the first state that ends after *start.
+ *
+ * The run stops at a gap, at a state without EXTENT_DELALLOC, at a state
+ * carrying EXTENT_BOUNDARY (other than the first), or once the accumulated
+ * size reaches 'max_bytes'.
+ *
+ * Returns the number of states in the run (0 if none).  When a run is
+ * found, *start and *end are set to its boundaries.  When none is found,
+ * *end is set to the end of the non-delalloc state that was hit, or to
+ * (u64)-1 if the search found no state at all.
+ */
+static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
+                                        u64 *start, u64 *end, u64 max_bytes)
+{
+        struct extent_state *cur;
+        struct rb_node *pos;
+        u64 expect_start = *start;
+        u64 bytes_seen = 0;
+        u64 found = 0;
+
+        spin_lock(&tree->lock);
+        /* tree_search() yields the first extent ending after our start */
+        pos = tree_search(tree, expect_start);
+        if (!pos)
+                *end = (u64)-1;
+
+        while (pos) {
+                cur = rb_entry(pos, struct extent_state, rb_node);
+
+                if (found) {
+                        /* the run must stay contiguous ... */
+                        if (cur->start != expect_start)
+                                break;
+                        /* ... and must not cross a boundary marker */
+                        if (cur->state & EXTENT_BOUNDARY)
+                                break;
+                }
+
+                if (!(cur->state & EXTENT_DELALLOC)) {
+                        if (!found)
+                                *end = cur->end;
+                        break;
+                }
+
+                if (!found)
+                        *start = cur->start;
+                found++;
+                *end = cur->end;
+                expect_start = cur->end + 1;
+
+                pos = rb_next(pos);
+                if (!pos)
+                        break;
+
+                bytes_seen += cur->end - cur->start + 1;
+                if (bytes_seen >= max_bytes)
+                        break;
+        }
+        spin_unlock(&tree->lock);
+        return found;
+}
+/*
+ * Unlock every page cache page covering [start, end] except locked_page,
+ * which is left locked for the caller.  Pages are looked up in batches of
+ * at most ARRAY_SIZE(pages).  Always returns 0.
+ *
+ * NOTE(review): if find_get_pages_contig() returns 0 the loop makes no
+ * progress -- presumably the pages cannot vanish because they are locked;
+ * confirm.
+ */
+static noinline int __unlock_for_delalloc(struct inode *inode,
+                                          struct page *locked_page,
+                                          u64 start, u64 end)
+{
+        int ret;
+        struct page *pages[16];
+        unsigned long index = start >> PAGE_CACHE_SHIFT;
+        unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+        unsigned long nr_pages = end_index - index + 1;
+        int i;
+        /* the range is just locked_page: nothing to unlock */
+        if (index == locked_page->index && end_index == index)
+                return 0;
+        while (nr_pages > 0) {
+                ret = find_get_pages_contig(inode->i_mapping, index,
+                                     min_t(unsigned long, nr_pages,
+                                     ARRAY_SIZE(pages)), pages);
+                for (i = 0; i < ret; i++) {
+                        if (pages[i] != locked_page)
+                                unlock_page(pages[i]);
+                        /* drop the reference taken by the lookup */
+                        page_cache_release(pages[i]);
+                }
+                nr_pages -= ret;
+                index += ret;
+                cond_resched();
+        }
+        return 0;
+}
+/*
+ * Lock every page cache page covering [delalloc_start, delalloc_end]
+ * except locked_page, which the caller already holds locked.
+ *
+ * Returns 0 when all pages were locked.  Returns -EAGAIN if the lookup
+ * finds no page at the current index, or if a page is no longer dirty or
+ * no longer belongs to inode->i_mapping once locked; in that case every
+ * page locked by this call is unlocked again before returning.
+ */
+static noinline int lock_delalloc_pages(struct inode *inode,
+                                        struct page *locked_page,
+                                        u64 delalloc_start,
+                                        u64 delalloc_end)
+{
+        unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
+        unsigned long start_index = index;
+        unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
+        unsigned long pages_locked = 0;
+        struct page *pages[16];
+        unsigned long nrpages;
+        int ret;
+        int i;
+        /* the caller is responsible for locking the start index */
+        if (index == locked_page->index && index == end_index)
+                return 0;
+        /* walk the whole range; locked_page itself is skipped below */
+        nrpages = end_index - index + 1;
+        while (nrpages > 0) {
+                ret = find_get_pages_contig(inode->i_mapping, index,
+                                     min_t(unsigned long,
+                                     nrpages, ARRAY_SIZE(pages)), pages);
+                if (ret == 0) {
+                        ret = -EAGAIN;
+                        goto done;
+                }
+                /* now we have an array of pages, lock them all */
+                for (i = 0; i < ret; i++) {
+                        /*
+                         * the caller is taking responsibility for
+                         * locked_page
+                         */
+                        if (pages[i] != locked_page) {
+                                lock_page(pages[i]);
+                                if (!PageDirty(pages[i]) ||
+                                    pages[i]->mapping != inode->i_mapping) {
+                                        unlock_page(pages[i]);
+                                        /*
+                                         * drop the lookup references on
+                                         * this page and on the rest of
+                                         * the batch, which would
+                                         * otherwise be leaked
+                                         */
+                                        for (; i < ret; i++)
+                                                page_cache_release(pages[i]);
+                                        ret = -EAGAIN;
+                                        goto done;
+                                }
+                        }
+                        /* the page lock, not the reference, keeps it ours */
+                        page_cache_release(pages[i]);
+                        pages_locked++;
+                }
+                nrpages -= ret;
+                index += ret;
+                cond_resched();
+        }
+        ret = 0;
+done:
+        /* on failure, undo the locks taken on the leading pages */
+        if (ret && pages_locked) {
+                __unlock_for_delalloc(inode, locked_page,
+                              delalloc_start,
+                              ((u64)(start_index + pages_locked - 1)) <<
+                              PAGE_CACHE_SHIFT);
+        }
+        return ret;
+}
+/*
+ * Find a run of delalloc bytes at or after *start (at most about
+ * 'max_bytes'), lock the pages backing it other than locked_page, and lock
+ * the range in the extent tree.
+ *
+ * On success *start and *end describe the locked range and the non-zero
+ * result of find_delalloc_range() is returned.
+ *
+ * If no delalloc run is found, or the run ends at or before *start, *start
+ * and *end are set to what find_delalloc_range() reported and its result
+ * is returned without locking anything.
+ *
+ * 0 is returned, with *start and *end untouched, if the pages still could
+ * not be locked after one retry limited to the rest of the first page.
+ */
+static noinline u64 find_lock_delalloc_range(struct inode *inode,
+                                             struct extent_io_tree *tree,
+                                             struct page *locked_page,
+                                             u64 *start, u64 *end,
+                                             u64 max_bytes)
+{
+        u64 delalloc_start;
+        u64 delalloc_end;
+        u64 found;
+        int ret;
+        int loops = 0;
+again:
+        /* step one, find a bunch of delalloc bytes starting at start */
+        delalloc_start = *start;
+        delalloc_end = 0;
+        found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
+                                    max_bytes);
+        if (!found || delalloc_end <= *start) {
+                *start = delalloc_start;
+                *end = delalloc_end;
+                return found;
+        }
+        /*
+         * start comes from the offset of locked_page.  We have to lock
+         * pages in order, so we can't process delalloc bytes before
+         * locked_page
+         */
+        if (delalloc_start < *start)
+                delalloc_start = *start;
+        /*
+         * make sure to limit the number of pages we try to lock down
+         * if we're looping.
+         */
+        if (delalloc_end + 1 - delalloc_start > max_bytes && loops)
+                delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;
+        /* step two, lock all the pages after the page that has start */
+        ret = lock_delalloc_pages(inode, locked_page,
+                                  delalloc_start, delalloc_end);
+        if (ret == -EAGAIN) {
+                /* some of the pages are gone, lets avoid looping by
+                 * shortening the size of the delalloc range we're searching
+                 */
+                if (!loops) {
+                        /* retry once, limited to the rest of start's page */
+                        unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
+                        max_bytes = PAGE_CACHE_SIZE - offset;
+                        loops = 1;
+                        goto again;
+                } else {
+                        found = 0;
+                        goto out_failed;
+                }
+        }
+        BUG_ON(ret);
+        /* step three, lock the state bits for the whole range */
+        lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
+        /* then test to make sure it is all still delalloc */
+        ret = test_range_bit(tree, delalloc_start, delalloc_end,
+                             EXTENT_DELALLOC, 1);
+        if (!ret) {
+                /* the range changed under us: undo both locks and start over */
+                unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
+                __unlock_for_delalloc(inode, locked_page,
+                              delalloc_start, delalloc_end);
+                cond_resched();
+                goto again;
+        }
+        *start = delalloc_start;
+        *end = delalloc_end;
+out_failed:
+        return found;
+}
+/*
+ * Clear selected state bits on [start, end] and apply selected page
+ * operations to every page cache page in the range except locked_page.
+ *
+ * Extent bits cleared, chosen by flag:
+ *   clear_unlock   -> EXTENT_LOCKED
+ *   clear_dirty    -> EXTENT_DIRTY
+ *   clear_delalloc -> EXTENT_DELALLOC
+ *
+ * Page operations, applied in this order when the flag is set:
+ *   clear_dirty    -> clear_page_dirty_for_io()
+ *   set_writeback  -> set_page_writeback()
+ *   end_writeback  -> end_page_writeback()
+ *   unlock_pages   -> unlock_page()
+ *
+ * Always returns 0.
+ *
+ * NOTE(review): if find_get_pages_contig() returns 0 the page loop makes no
+ * progress -- presumably callers guarantee the pages are present; confirm.
+ */
+int extent_clear_unlock_delalloc(struct inode *inode,
+                                struct extent_io_tree *tree,
+                                u64 start, u64 end, struct page *locked_page,
+                                int unlock_pages,
+                                int clear_unlock,
+                                int clear_delalloc, int clear_dirty,
+                                int set_writeback,
+                                int end_writeback)
+{
+        int ret;
+        struct page *pages[16];
+        unsigned long index = start >> PAGE_CACHE_SHIFT;
+        unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+        unsigned long nr_pages = end_index - index + 1;
+        int i;
+        int clear_bits = 0;
+        if (clear_unlock)
+                clear_bits |= EXTENT_LOCKED;
+        if (clear_dirty)
+                clear_bits |= EXTENT_DIRTY;
+        if (clear_delalloc)
+                clear_bits |= EXTENT_DELALLOC;
+        clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS);
+        /* no per-page work requested: skip the page cache walk */
+        if (!(unlock_pages || clear_dirty || set_writeback || end_writeback))
+                return 0;
+        while (nr_pages > 0) {
+                ret = find_get_pages_contig(inode->i_mapping, index,
+                                     min_t(unsigned long,
+                                     nr_pages, ARRAY_SIZE(pages)), pages);
+                for (i = 0; i < ret; i++) {
+                        /* locked_page is left alone; only drop our reference */
+                        if (pages[i] == locked_page) {
+                                page_cache_release(pages[i]);
+                                continue;
+                        }
+                        if (clear_dirty)
+                                clear_page_dirty_for_io(pages[i]);
+                        if (set_writeback)
+                                set_page_writeback(pages[i]);
+                        if (end_writeback)
+                                end_page_writeback(pages[i]);
+                        if (unlock_pages)
+                                unlock_page(pages[i]);
+                        page_cache_release(pages[i]);
+                }
+                nr_pages -= ret;
+                index += ret;
+                cond_resched();
+        }
+        return 0;
+}
+/*
+ * Count the bytes between *start and search_end (inclusive) that lie in
+ * extent states with any of 'bits' set, stopping once 'max_bytes' has been
+ * reached.  The total is returned.
+ *
+ * *start is updated to the start of the first matching state, unless that
+ * first state alone already reaches max_bytes.
+ *
+ * Shortcut: for *start == 0 and bits == EXTENT_DIRTY the cached
+ * tree->dirty_bytes is returned directly, without applying search_end or
+ * max_bytes.
+ *
+ * search_end <= *start triggers a WARN and returns 0.
+ */
+u64 count_range_bits(struct extent_io_tree *tree,
+                     u64 *start, u64 search_end, u64 max_bytes,
+                     unsigned long bits)
+{
+        struct extent_state *cur;
+        struct rb_node *pos;
+        u64 from = *start;
+        u64 total_bytes = 0;
+        int have_first = 0;
+
+        if (search_end <= from) {
+                WARN_ON(1);
+                return 0;
+        }
+
+        spin_lock(&tree->lock);
+        if (from == 0 && bits == EXTENT_DIRTY) {
+                total_bytes = tree->dirty_bytes;
+        } else {
+                /* tree_search() yields the first extent ending after 'from' */
+                for (pos = tree_search(tree, from); pos; pos = rb_next(pos)) {
+                        cur = rb_entry(pos, struct extent_state, rb_node);
+                        if (cur->start > search_end)
+                                break;
+                        if (cur->end < from || !(cur->state & bits))
+                                continue;
+
+                        /* only the part inside [from, search_end] counts */
+                        total_bytes += min(search_end, cur->end) + 1 -
+                                       max(from, cur->start);
+                        if (total_bytes >= max_bytes)
+                                break;
+                        if (!have_first) {
+                                *start = cur->start;
+                                have_first = 1;
+                        }
+                }
+        }
+        spin_unlock(&tree->lock);
+        return total_bytes;
+}
+#if 0
+/*
+ * Everything up to the matching #endif is compiled out.
+ */
+/*
+ * helper function to lock both pages and extents in the tree.
+ * pages must be locked first.
+ *
+ * Returns 0 on success, -ENOMEM or the PTR_ERR() of the failed page lookup
+ * otherwise; on failure the pages grabbed so far are unlocked again.
+ */
+static int lock_range(struct extent_io_tree *tree, u64 start, u64 end)
+{
+        unsigned long index = start >> PAGE_CACHE_SHIFT;
+        unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+        struct page *page;
+        int err;
+        while (index <= end_index) {
+                page = grab_cache_page(tree->mapping, index);
+                if (!page) {
+                        err = -ENOMEM;
+                        goto failed;
+                }
+                if (IS_ERR(page)) {
+                        err = PTR_ERR(page);
+                        goto failed;
+                }
+                index++;
+        }
+        lock_extent(tree, start, end, GFP_NOFS);
+        return 0;
+failed:
+        /*
+         * we failed above in getting the page at 'index', so we undo here
+         * up to but not including the page at 'index'
+         */
+        end_index = index;
+        index = start >> PAGE_CACHE_SHIFT;
+        while (index < end_index) {
+                /* NOTE(review): the lookup result is not checked for NULL */
+                page = find_get_page(tree->mapping, index);
+                unlock_page(page);
+                page_cache_release(page);
+                index++;
+        }
+        return err;
+}
+/*
+ * helper function to unlock both pages and extents in the tree.
+ * Always returns 0.
+ */
+static int unlock_range(struct extent_io_tree *tree, u64 start, u64 end)
+{
+        unsigned long index = start >> PAGE_CACHE_SHIFT;
+        unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+        struct page *page;
+        while (index <= end_index) {
+                /* NOTE(review): the lookup result is not checked for NULL */
+                page = find_get_page(tree->mapping, index);
+                unlock_page(page);
+                page_cache_release(page);
+                index++;
+        }
+        unlock_extent(tree, start, end, GFP_NOFS);
+        return 0;
+}
+#endif
+/*
+ * Store 'private' in the extent state that begins exactly at 'start'.
+ *
+ * Returns 0 on success, or -ENOENT (changing nothing) when the tree has no
+ * state starting at that offset.  Takes tree->lock.
+ */
+int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
+{
+        struct extent_state *match;
+        struct rb_node *pos;
+        int ret = -ENOENT;
+
+        spin_lock(&tree->lock);
+        /* tree_search() yields the first extent ending after 'start' */
+        pos = tree_search(tree, start);
+        if (pos) {
+                match = rb_entry(pos, struct extent_state, rb_node);
+                if (match->start == start) {
+                        match->private = private;
+                        ret = 0;
+                }
+        }
+        spin_unlock(&tree->lock);
+        return ret;
+}
+/*
+ * Read the private value of the extent state that begins exactly at
+ * 'start' into *private.
+ *
+ * Returns 0 on success, or -ENOENT (leaving *private untouched) when the
+ * tree has no state starting at that offset.  Takes tree->lock.
+ */
+int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
+{
+        struct extent_state *match;
+        struct rb_node *pos;
+        int ret = -ENOENT;
+
+        spin_lock(&tree->lock);
+        /* tree_search() yields the first extent ending after 'start' */
+        pos = tree_search(tree, start);
+        if (pos) {
+                match = rb_entry(pos, struct extent_state, rb_node);
+                if (match->start == start) {
+                        *private = match->private;
+                        ret = 0;
+                }
+        }
+        spin_unlock(&tree->lock);
+        return ret;
+}
+/*
+ * searches the inclusive range [start, end] in the state tree for a given
+ * mask.
+ *
+ * If 'filled' == 1, this returns 1 only if the whole range is covered by
+ * extent states that each have at least one of 'bits' set; a hole in the
+ * range or a state without the bits yields 0.
+ * Otherwise, 1 is returned if any state overlapping the range has one of
+ * the bits set.
+ *
+ * This takes the tree lock.
+ */
+int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
+                   int bits, int filled)
+{
+        struct extent_state *state = NULL;
+        struct rb_node *node;
+        int bitset = 0;
+        spin_lock(&tree->lock);
+        node = tree_search(tree, start);
+        while (node && start <= end) {
+                state = rb_entry(node, struct extent_state, rb_node);
+                /* a gap before this state means the range is not filled */
+                if (filled && state->start > start) {
+                        bitset = 0;
+                        break;
+                }
+                if (state->start > end)
+                        break;
+                if (state->state & bits) {
+                        bitset = 1;
+                        if (!filled)
+                                break;
+                } else if (filled) {
+                        bitset = 0;
+                        break;
+                }
+                /*
+                 * a state ending at (u64)-1 covers the rest of any range;
+                 * stop here, since 'end + 1' would wrap start to 0 and the
+                 * filled case would then wrongly report 0 below
+                 */
+                if (state->end == (u64)-1)
+                        break;
+                start = state->end + 1;
+                if (start > end)
+                        break;
+                node = rb_next(node);
+                if (!node) {
+                        /* ran out of states before reaching 'end' */
+                        if (filled)
+                                bitset = 0;
+                        break;
+                }
+        }
+        spin_unlock(&tree->lock);
+        return bitset;
+}
+/*
+ * Flag the page uptodate when test_range_bit() reports the page's whole
+ * byte range as EXTENT_UPTODATE in the tree.  Always returns 0.
+ */
+static int check_page_uptodate(struct extent_io_tree *tree,
+                               struct page *page)
+{
+        const u64 first = (u64)page->index << PAGE_CACHE_SHIFT;
+        const u64 last = first + PAGE_CACHE_SIZE - 1;
+        int all_uptodate;
+        all_uptodate = test_range_bit(tree, first, last, EXTENT_UPTODATE, 1);
+        if (all_uptodate)
+                SetPageUptodate(page);
+        return 0;
+}
+/*
+ * Unlock the page unless test_range_bit() still finds EXTENT_LOCKED set
+ * somewhere inside the page's byte range.  Always returns 0.
+ */
+static int check_page_locked(struct extent_io_tree *tree,
+                             struct page *page)
+{
+        const u64 first = (u64)page->index << PAGE_CACHE_SHIFT;
+        const u64 last = first + PAGE_CACHE_SIZE - 1;
+        /* something in the page is still locked: leave the page alone */
+        if (test_range_bit(tree, first, last, EXTENT_LOCKED, 0))
+                return 0;
+        unlock_page(page);
+        return 0;
+}
+/*
+ * End writeback on the page unless test_range_bit() still finds
+ * EXTENT_WRITEBACK set somewhere inside the page's byte range.
+ * Always returns 0.
+ */
+static int check_page_writeback(struct extent_io_tree *tree,
+                             struct page *page)
+{
+        const u64 first = (u64)page->index << PAGE_CACHE_SHIFT;
+        const u64 last = first + PAGE_CACHE_SIZE - 1;
+        /* part of the page is still under writeback: nothing to do yet */
+        if (test_range_bit(tree, first, last, EXTENT_WRITEBACK, 0))
+                return 0;
+        end_page_writeback(page);
+        return 0;
+}
+/* lots and lots of room for performance fixes in the end_bio funcs */
+/*
+ * after a writepage IO is done, we need to:
+ * clear the uptodate bits on error
+ * clear the writeback bits in the extent tree for this IO
+ * end_page_writeback if the page has no more pending IO
+ *
+ * Scheduling is not allowed, so the extent state tree is expected
+ * to have one and only one object corresponding to this IO.
+ *
+ * 'err' is the completion status of the bio; zero means success.  The
+ * bio_vecs are walked from the last entry back to the first, and the bio
+ * reference is dropped with bio_put() once all of them are handled.
+ */
+static void end_bio_extent_writepage(struct bio *bio, int err)
+{
+        int uptodate = err == 0;
+        struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+        struct extent_io_tree *tree;
+        u64 start;
+        u64 end;
+        int whole_page;
+        int ret;
+        do {
+                struct page *page = bvec->bv_page;
+                tree = &BTRFS_I(page->mapping->host)->io_tree;
+                /* inclusive file byte range covered by this bio_vec */
+                start = ((u64)page->index << PAGE_CACHE_SHIFT) +
+                         bvec->bv_offset;
+                end = start + bvec->bv_len - 1;
+                if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
+                        whole_page = 1;
+                else
+                        whole_page = 0;
+                /*
+                 * step to the previous bio_vec right away and prefetch its
+                 * page flags; 'page', 'start' and 'end' keep describing the
+                 * bio_vec being completed.
+                 */
+                if (--bvec >= bio->bi_io_vec)
+                        prefetchw(&bvec->bv_page->flags);
+                /*
+                 * a non-zero return from the hook turns this into a failed
+                 * write.  'uptodate' is not re-derived at the top of the
+                 * loop, so it stays 0 for the remaining bio_vecs unless the
+                 * failed hook below resets it.
+                 */
+                if (tree->ops && tree->ops->writepage_end_io_hook) {
+                        ret = tree->ops->writepage_end_io_hook(page, start,
+                                                       end, NULL, uptodate);
+                        if (ret)
+                                uptodate = 0;
+                }
+                if (!uptodate && tree->ops &&
+                    tree->ops->writepage_io_failed_hook) {
+                        ret = tree->ops->writepage_io_failed_hook(bio, page,
+                                                         start, end, NULL);
+                        /*
+                         * ret == 0: the error marking and the writeback
+                         * clearing below are skipped for this bio_vec;
+                         * presumably the hook took over the failure --
+                         * confirm against the hook implementation.
+                         * 'continue' jumps to the loop condition; bvec was
+                         * already decremented above.
+                         */
+                        if (ret == 0) {
+                                uptodate = (err == 0);
+                                continue;
+                        }
+                }
+                if (!uptodate) {
+                        clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
+                        ClearPageUptodate(page);
+                        SetPageError(page);
+                }
+                clear_extent_writeback(tree, start, end, GFP_ATOMIC);
+                /*
+                 * a bio_vec covering the whole page ends page writeback
+                 * directly; a partial one only does so once no range of
+                 * the page is still marked EXTENT_WRITEBACK in the tree.
+                 */
+                if (whole_page)
+                        end_page_writeback(page);
+                else
+                        check_page_writeback(tree, page);
+        } while (bvec >= bio->bi_io_vec);
+        bio_put(bio);
+}
+/*
+ * after a readpage IO is done, we need to:
+ * clear the uptodate bits on error
+ * set the uptodate bits if things worked
+ * set the page up to date if all extents in the tree are uptodate
+ * clear the lock bit in the extent tree
+ * unlock the page if there are no other extents locked for it
+ *
+ * Scheduling is not allowed, so the extent state tree is expected
+ * to have one and only one object corresponding to this IO.
+ *
+ * The read counts as successful only when the bio carries BIO_UPTODATE
+ * and 'err' is zero.  The bio_vecs are walked from the last entry back to
+ * the first, and the bio reference is dropped with bio_put() at the end.
+ */
+static void end_bio_extent_readpage(struct bio *bio, int err)
+{
+        int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+        struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+        struct extent_io_tree *tree;
+        u64 start;
+        u64 end;
+        int whole_page;
+        int ret;
+        if (err)
+                uptodate = 0;
+        do {
+                struct page *page = bvec->bv_page;
+                tree = &BTRFS_I(page->mapping->host)->io_tree;
+                /* inclusive file byte range covered by this bio_vec */
+                start = ((u64)page->index << PAGE_CACHE_SHIFT) +
+                        bvec->bv_offset;
+                end = start + bvec->bv_len - 1;
+                if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
+                        whole_page = 1;
+                else
+                        whole_page = 0;
+                /*
+                 * step to the previous bio_vec right away and prefetch its
+                 * page flags; 'page', 'start' and 'end' keep describing the
+                 * bio_vec being completed.
+                 */
+                if (--bvec >= bio->bi_io_vec)
+                        prefetchw(&bvec->bv_page->flags);
+                /*
+                 * the end_io hook only runs for reads that succeeded so
+                 * far; a non-zero return turns the read into a failure.
+                 * 'uptodate' then stays 0 for the remaining bio_vecs unless
+                 * the failed hook below resets it.
+                 */
+                if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
+                        ret = tree->ops->readpage_end_io_hook(page, start, end,
+                                                              NULL);
+                        if (ret)
+                                uptodate = 0;
+                }
+                if (!uptodate && tree->ops &&
+                    tree->ops->readpage_io_failed_hook) {
+                        ret = tree->ops->readpage_io_failed_hook(bio, page,
+                                                         start, end, NULL);
+                        /*
+                         * ret == 0: the extent and the page are left locked
+                         * for this bio_vec; presumably the hook took over
+                         * the failure -- confirm against the hook
+                         * implementation.  'uptodate' is re-derived from
+                         * the bio and 'continue' jumps to the loop
+                         * condition; bvec was already decremented above.
+                         */
+                        if (ret == 0) {
+                                uptodate =
+                                        test_bit(BIO_UPTODATE, &bio->bi_flags);
+                                if (err)
+                                        uptodate = 0;
+                                continue;
+                        }
+                }
+                if (uptodate) {
+                        set_extent_uptodate(tree, start, end,
+                                            GFP_ATOMIC);
+                }
+                unlock_extent(tree, start, end, GFP_ATOMIC);
+                /*
+                 * a bio_vec covering the whole page settles the page state
+                 * directly; a partial one has to consult the tree, since
+                 * other ranges of the same page may still be pending.
+                 */
+                if (whole_page) {
+                        if (uptodate) {
+                                SetPageUptodate(page);
+                        } else {
+                                ClearPageUptodate(page);
+                                SetPageError(page);
+                        }
+                        unlock_page(page);
+                } else {
+                        if (uptodate) {
+                                check_page_uptodate(tree, page);
+                        } else {
+                                ClearPageUptodate(page);
+                                SetPageError(page);
+                        }
+                        check_page_locked(tree, page);
+                }
+        } while (bvec >= bio->bi_io_vec);
+        bio_put(bio);
+}
+/*
+ * Completion handler for reads issued on behalf of prepare_write.
+ *
+ * Each bio_vec, walked from the last entry back to the first, has its
+ * byte range marked uptodate in the extent tree when the bio carries
+ * BIO_UPTODATE; otherwise the page is flagged with an error.  Either way
+ * the range is unlocked in the tree.  'err' is not looked at here.
+ */
+static void end_bio_extent_preparewrite(struct bio *bio, int err)
+{
+        struct bio_vec *vec = bio->bi_io_vec + bio->bi_vcnt - 1;
+        const int io_ok = test_bit(BIO_UPTODATE, &bio->bi_flags);
+        do {
+                struct page *page = vec->bv_page;
+                struct extent_io_tree *tree;
+                u64 first;
+                u64 last;
+                tree = &BTRFS_I(page->mapping->host)->io_tree;
+                first = (u64)page->index << PAGE_CACHE_SHIFT;
+                first += vec->bv_offset;
+                last = first + vec->bv_len - 1;
+                /* move on to the previous bio_vec and warm up its page */
+                vec--;
+                if (vec >= bio->bi_io_vec)
+                        prefetchw(&vec->bv_page->flags);
+                if (!io_ok) {
+                        ClearPageUptodate(page);
+                        SetPageError(page);
+                } else {
+                        set_extent_uptodate(tree, first, last, GFP_ATOMIC);
+                }
+                unlock_extent(tree, first, last, GFP_ATOMIC);
+        } while (vec >= bio->bi_io_vec);
+        bio_put(bio);
+}
+/*
+ * Allocate a bio with room for 'nr_vecs' pages and point it at 'bdev',
+ * starting at 'first_sector' with a size of zero.
+ *
+ * When the allocation fails and the current task has PF_MEMALLOC set, the
+ * request is retried with the vector count halved each time until it
+ * succeeds or the count reaches zero.  Returns NULL on failure.
+ */
+static struct bio *
+extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
+                 gfp_t gfp_flags)
+{
+        struct bio *new_bio = bio_alloc(gfp_flags, nr_vecs);
+        if (!new_bio && (current->flags & PF_MEMALLOC)) {
+                for (nr_vecs /= 2; nr_vecs; nr_vecs /= 2) {
+                        new_bio = bio_alloc(gfp_flags, nr_vecs);
+                        if (new_bio)
+                                break;
+                }
+        }
+        if (!new_bio)
+                return NULL;
+        new_bio->bi_size = 0;
+        new_bio->bi_bdev = bdev;
+        new_bio->bi_sector = first_sector;
+        return new_bio;
+}
+/*
+ * Hand a fully built bio to the block layer, going through the tree's
+ * submit_bio_hook when one is installed and through submit_bio() otherwise.
+ *
+ * The extent_io_tree is taken from bio->bi_private, which is reset to NULL
+ * before submission.  An extra bio reference is held across the submit so
+ * that the BIO_EOPNOTSUPP flag can still be inspected afterwards.
+ *
+ * Returns -EOPNOTSUPP if the bio came back flagged BIO_EOPNOTSUPP, and 0
+ * otherwise.  The return value of submit_bio_hook is not examined.
+ */
+static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
+                          unsigned long bio_flags)
+{
+        int ret = 0;
+        struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+        /* the inode passed to the hook comes from the last page in the bio */
+        struct page *page = bvec->bv_page;
+        struct extent_io_tree *tree = bio->bi_private;
+        bio->bi_private = NULL;
+        bio_get(bio);
+        if (tree->ops && tree->ops->submit_bio_hook)
+                tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
+                                           mirror_num, bio_flags);
+        else
+                submit_bio(rw, bio);
+        if (bio_flagged(bio, BIO_EOPNOTSUPP))
+                ret = -EOPNOTSUPP;
+        bio_put(bio);
+        return ret;
+}
+/*
+ * Queue one page (or part of it) for IO, merging it into the bio the
+ * caller is accumulating in '*bio_ret' when possible.
+ *
+ * The page is appended to the existing bio only if the bio flags match
+ * 'prev_bio_flags', the sectors are contiguous (for a compressed bio the
+ * start sector must be identical instead), the tree's merge_bio_hook, if
+ * any, does not object, and bio_add_page() accepts the full length.
+ * Otherwise the existing bio is submitted and a new one is started.
+ *
+ * With 'bio_ret' NULL the new bio is submitted right away; otherwise it is
+ * handed back through '*bio_ret' for further merging and the caller has
+ * to submit it.  'max_pages' is currently not used.
+ *
+ * Returns 0 when the page was merged, -ENOMEM when no new bio could be
+ * allocated, and otherwise the status of the last submit_one_bio() call
+ * made here (0 if there was none).
+ */
+static int submit_extent_page(int rw, struct extent_io_tree *tree,
+                              struct page *page, sector_t sector,
+                              size_t size, unsigned long offset,
+                              struct block_device *bdev,
+                              struct bio **bio_ret,
+                              unsigned long max_pages,
+                              bio_end_io_t end_io_func,
+                              int mirror_num,
+                              unsigned long prev_bio_flags,
+                              unsigned long bio_flags)
+{
+        int ret = 0;
+        struct bio *bio;
+        int nr;
+        int contig = 0;
+        int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
+        int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
+        size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);
+        if (bio_ret && *bio_ret) {
+                bio = *bio_ret;
+                if (old_compressed)
+                        contig = bio->bi_sector == sector;
+                else
+                        contig = bio->bi_sector + (bio->bi_size >> 9) ==
+                                sector;
+                if (prev_bio_flags != bio_flags || !contig ||
+                    (tree->ops && tree->ops->merge_bio_hook &&
+                     tree->ops->merge_bio_hook(page, offset, page_size, bio,
+                                               bio_flags)) ||
+                    bio_add_page(bio, page, page_size, offset) < page_size) {
+                        ret = submit_one_bio(rw, bio, mirror_num,
+                                             prev_bio_flags);
+                        bio = NULL;
+                } else {
+                        return 0;
+                }
+        }
+        if (this_compressed)
+                nr = BIO_MAX_PAGES;
+        else
+                nr = bio_get_nr_vecs(bdev);
+        bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
+        if (!bio) {
+                /*
+                 * any bio the caller was accumulating has been submitted
+                 * above and is no longer ours; do not leave a stale
+                 * pointer to it behind.
+                 */
+                if (bio_ret)
+                        *bio_ret = NULL;
+                return -ENOMEM;
+        }
+        bio_add_page(bio, page, page_size, offset);
+        bio->bi_end_io = end_io_func;
+        /* submit_one_bio() picks the tree back up from bi_private */
+        bio->bi_private = tree;
+        if (bio_ret)
+                *bio_ret = bio;
+        else
+                ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
+        return ret;
+}
+/*
+ * Tag a page as tracked by the extent code: unless the page already has
+ * PagePrivate set, set it, take a page reference and store
+ * EXTENT_PAGE_PRIVATE as the private value.
+ */
+void set_page_extent_mapped(struct page *page)
+{
+        /* already tagged: nothing to do */
+        if (PagePrivate(page))
+                return;
+        SetPagePrivate(page);
+        page_cache_get(page);
+        set_page_private(page, EXTENT_PAGE_PRIVATE);
+}
+/*
+ * Store EXTENT_PAGE_PRIVATE_FIRST_PAGE in the page's private value, with
+ * 'len' packed in above the two low bits.
+ */
+static void set_page_extent_head(struct page *page, unsigned long len)
+{
+        set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | (len << 2));
+}
+/*
+ * basic readpage implementation.  Locked extent state structs are inserted
+ * into the tree that are removed when the IO is done (by the end_io
+ * handlers)
+ *
+ * The page's byte range is locked in the tree and then walked one extent
+ * at a time, each extent being looked up with 'get_extent'.  Ranges past
+ * i_size, holes and preallocated extents are zero filled and marked
+ * uptodate; ranges that are already uptodate are just unlocked; everything
+ * else is queued through submit_extent_page(), accumulating into '*bio'.
+ * '*bio_flags' carries the flags of the bio being accumulated and is
+ * updated after every queued range.
+ *
+ * If no IO was queued the page is unlocked here (and marked uptodate
+ * unless an error was flagged).  Always returns 0; failures are reported
+ * through PageError.
+ */
+static int __extent_read_full_page(struct extent_io_tree *tree,
+                                   struct page *page,
+                                   get_extent_t *get_extent,
+                                   struct bio **bio, int mirror_num,
+                                   unsigned long *bio_flags)
+{
+        struct inode *inode = page->mapping->host;
+        u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+        u64 page_end = start + PAGE_CACHE_SIZE - 1;
+        u64 end;
+        u64 cur = start;
+        u64 extent_offset;
+        u64 last_byte = i_size_read(inode);
+        u64 block_start;
+        u64 cur_end;
+        sector_t sector;
+        struct extent_map *em;
+        struct block_device *bdev;
+        int ret;
+        int nr = 0;
+        size_t page_offset = 0;
+        size_t iosize;
+        size_t disk_io_size;
+        size_t blocksize = inode->i_sb->s_blocksize;
+        unsigned long this_bio_flag = 0;
+        set_page_extent_mapped(page);
+        end = page_end;
+        lock_extent(tree, start, end, GFP_NOFS);
+        /*
+         * the page holding i_size: when i_size is not page aligned, zero
+         * everything from i_size to the end of the page up front.
+         */
+        if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
+                char *userpage;
+                size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
+                if (zero_offset) {
+                        iosize = PAGE_CACHE_SIZE - zero_offset;
+                        userpage = kmap_atomic(page, KM_USER0);
+                        memset(userpage + zero_offset, 0, iosize);
+                        flush_dcache_page(page);
+                        kunmap_atomic(userpage, KM_USER0);
+                }
+        }
+        while (cur <= end) {
+                /* at or past i_size: zero the rest of the page and stop */
+                if (cur >= last_byte) {
+                        char *userpage;
+                        iosize = PAGE_CACHE_SIZE - page_offset;
+                        userpage = kmap_atomic(page, KM_USER0);
+                        memset(userpage + page_offset, 0, iosize);
+                        flush_dcache_page(page);
+                        kunmap_atomic(userpage, KM_USER0);
+                        set_extent_uptodate(tree, cur, cur + iosize - 1,
+                                            GFP_NOFS);
+                        unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+                        break;
+                }
+                em = get_extent(inode, page, page_offset, cur,
+                                end - cur + 1, 0);
+                /* lookup failed: flag the page, unlock the rest, give up */
+                if (IS_ERR(em) || !em) {
+                        SetPageError(page);
+                        unlock_extent(tree, cur, end, GFP_NOFS);
+                        break;
+                }
+                extent_offset = cur - em->start;
+                BUG_ON(extent_map_end(em) <= cur);
+                BUG_ON(end < cur);
+                /*
+                 * NOTE(review): this_bio_flag is never cleared inside the
+                 * loop, so once a compressed extent was seen, later extents
+                 * of the same page are treated as compressed too -- confirm
+                 * this is intended.
+                 */
+                if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+                        this_bio_flag = EXTENT_BIO_COMPRESSED;
+                /*
+                 * bytes to handle this round: up to the end of the extent
+                 * or of the page, rounded up to a multiple of the block size
+                 */
+                iosize = min(extent_map_end(em) - cur, end - cur + 1);
+                cur_end = min(extent_map_end(em) - 1, end);
+                iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
+                /*
+                 * compressed extents are read from the start of the extent
+                 * on disk for its whole on-disk length, regardless of the
+                 * offset of this page inside the extent.
+                 */
+                if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
+                        disk_io_size = em->block_len;
+                        sector = em->block_start >> 9;
+                } else {
+                        sector = (em->block_start + extent_offset) >> 9;
+                        disk_io_size = iosize;
+                }
+                bdev = em->bdev;
+                block_start = em->block_start;
+                /* preallocated extents are read back as holes */
+                if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+                        block_start = EXTENT_MAP_HOLE;
+                free_extent_map(em);
+                em = NULL;
+                /* we've found a hole, just zero and go on */
+                if (block_start == EXTENT_MAP_HOLE) {
+                        char *userpage;
+                        userpage = kmap_atomic(page, KM_USER0);
+                        memset(userpage + page_offset, 0, iosize);
+                        flush_dcache_page(page);
+                        kunmap_atomic(userpage, KM_USER0);
+                        set_extent_uptodate(tree, cur, cur + iosize - 1,
+                                            GFP_NOFS);
+                        unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+                        cur = cur + iosize;
+                        page_offset += iosize;
+                        continue;
+                }
+                /* the get_extent function already copied into the page */
+                if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
+                        check_page_uptodate(tree, page);
+                        unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+                        cur = cur + iosize;
+                        page_offset += iosize;
+                        continue;
+                }
+                /* we have an inline extent but it didn't get marked up
+                 * to date.  Error out
+                 */
+                if (block_start == EXTENT_MAP_INLINE) {
+                        SetPageError(page);
+                        unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+                        cur = cur + iosize;
+                        page_offset += iosize;
+                        continue;
+                }
+                /* the optional hook can veto the read by returning non-zero */
+                ret = 0;
+                if (tree->ops && tree->ops->readpage_io_hook) {
+                        ret = tree->ops->readpage_io_hook(page, cur,
+                                                          cur + iosize - 1);
+                }
+                if (!ret) {
+                        /* pages from this one up to the one holding i_size */
+                        unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
+                        pnr -= page->index;
+                        ret = submit_extent_page(READ, tree, page,
+                                         sector, disk_io_size, page_offset,
+                                         bdev, bio, pnr,
+                                         end_bio_extent_readpage, mirror_num,
+                                         *bio_flags,
+                                         this_bio_flag);
+                        nr++;
+                        *bio_flags = this_bio_flag;
+                }
+                /*
+                 * NOTE(review): on failure the range [cur, cur + iosize - 1]
+                 * is not unlocked here -- confirm who releases it.
+                 */
+                if (ret)
+                        SetPageError(page);
+                cur = cur + iosize;
+                page_offset += iosize;
+        }
+        /*
+         * nothing was queued, so no end_io handler will run for this page:
+         * finish it here.
+         */
+        if (!nr) {
+                if (!PageError(page))
+                        SetPageUptodate(page);
+                unlock_page(page);
+        }
+        return 0;
+}
+/*
+ * Read a single page through the extent tree: run
+ * __extent_read_full_page() with a fresh bio accumulator and mirror 0,
+ * then submit whatever bio it left pending.  Returns the result of
+ * __extent_read_full_page(); the submit status is not reported.
+ */
+int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
+                            get_extent_t *get_extent)
+{
+        unsigned long flags = 0;
+        struct bio *pending = NULL;
+        int err;
+        err = __extent_read_full_page(tree, page, get_extent, &pending, 0,
+                                      &flags);
+        /* no bio left behind: nothing more to submit */
+        if (!pending)
+                return err;
+        submit_one_bio(READ, pending, 0, flags);
+        return err;
+}
+/*
+ * the writepage semantics are similar to regular writepage.  extent
+ * records are inserted to lock ranges in the tree, and as dirty areas
+ * are found, they are marked writeback.  Then the lock bits are removed
+ * and the end_io handler clears the writeback ranges
+ *
+ * 'data' is a struct extent_page_data supplying the tree, the get_extent
+ * callback, the bio being accumulated and the extent_locked flag.  The
+ * page must be locked on entry.  wbc->nr_to_write is reduced by the
+ * number of pages accounted as written.  Always returns 0; failures are
+ * reported through PageError.
+ */
+static int __extent_writepage(struct page *page, struct writeback_control *wbc,
+                              void *data)
+{
+        struct inode *inode = page->mapping->host;
+        struct extent_page_data *epd = data;
+        struct extent_io_tree *tree = epd->tree;
+        u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+        u64 delalloc_start;
+        u64 page_end = start + PAGE_CACHE_SIZE - 1;
+        u64 end;
+        u64 cur = start;
+        u64 extent_offset;
+        u64 last_byte = i_size_read(inode);
+        u64 block_start;
+        u64 iosize;
+        u64 unlock_start;
+        sector_t sector;
+        struct extent_map *em;
+        struct block_device *bdev;
+        int ret;
+        int nr = 0;
+        size_t pg_offset = 0;
+        size_t blocksize;
+        loff_t i_size = i_size_read(inode);
+        unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
+        u64 nr_delalloc;
+        u64 delalloc_end;
+        int page_started;
+        int compressed;
+        unsigned long nr_written = 0;
+        WARN_ON(!PageLocked(page));
+        /* the page lies entirely at or beyond i_size: drop it, write nothing */
+        pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
+        if (page->index > end_index ||
+           (page->index == end_index && !pg_offset)) {
+                page->mapping->a_ops->invalidatepage(page, 0);
+                unlock_page(page);
+                return 0;
+        }
+        /* the page straddles i_size: zero the part beyond it */
+        if (page->index == end_index) {
+                char *userpage;
+                userpage = kmap_atomic(page, KM_USER0);
+                memset(userpage + pg_offset, 0,
+                       PAGE_CACHE_SIZE - pg_offset);
+                kunmap_atomic(userpage, KM_USER0);
+                flush_dcache_page(page);
+        }
+        pg_offset = 0;
+        set_page_extent_mapped(page);
+        delalloc_start = start;
+        delalloc_end = 0;
+        page_started = 0;
+        if (!epd->extent_locked) {
+                /*
+                 * hand every delalloc range that touches this page to
+                 * fill_delalloc.  The last argument of
+                 * find_lock_delalloc_range is presumably a cap on the
+                 * range size -- confirm.  The nr_delalloc == 0 branch
+                 * relies on delalloc_end having been advanced by the
+                 * callee.
+                 * NOTE(review): tree->ops is dereferenced without the NULL
+                 * check used elsewhere in this function.
+                 */
+                while (delalloc_end < page_end) {
+                        nr_delalloc = find_lock_delalloc_range(inode, tree,
+                                                       page,
+                                                       &delalloc_start,
+                                                       &delalloc_end,
+                                                       128 * 1024 * 1024);
+                        if (nr_delalloc == 0) {
+                                delalloc_start = delalloc_end + 1;
+                                continue;
+                        }
+                        tree->ops->fill_delalloc(inode, page, delalloc_start,
+                                                 delalloc_end, &page_started,
+                                                 &nr_written);
+                        delalloc_start = delalloc_end + 1;
+                }
+                /* did the fill delalloc function already unlock and start
+                 * the IO?
+                 */
+                if (page_started) {
+                        ret = 0;
+                        goto update_nr_written;
+                }
+        }
+        lock_extent(tree, start, page_end, GFP_NOFS);
+        /* first byte of the page range that is still locked in the tree */
+        unlock_start = start;
+        /*
+         * -EAGAIN from the start hook: release everything and redirty the
+         * page for a later pass.  Other return values are not acted upon.
+         */
+        if (tree->ops && tree->ops->writepage_start_hook) {
+                ret = tree->ops->writepage_start_hook(page, start,
+                                                      page_end);
+                if (ret == -EAGAIN) {
+                        unlock_extent(tree, start, page_end, GFP_NOFS);
+                        redirty_page_for_writepage(wbc, page);
+                        unlock_page(page);
+                        ret = 0;
+                        goto update_nr_written;
+                }
+        }
+        nr_written++;
+        end = page_end;
+        if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0))
+                printk(KERN_ERR "btrfs delalloc bits after lock_extent\n");
+        /* the whole page is at or past i_size: nothing to write */
+        if (last_byte <= start) {
+                clear_extent_dirty(tree, start, page_end, GFP_NOFS);
+                unlock_extent(tree, start, page_end, GFP_NOFS);
+                if (tree->ops && tree->ops->writepage_end_io_hook)
+                        tree->ops->writepage_end_io_hook(page, start,
+                                                         page_end, NULL, 1);
+                unlock_start = page_end + 1;
+                goto done;
+        }
+        set_extent_uptodate(tree, start, page_end, GFP_NOFS);
+        blocksize = inode->i_sb->s_blocksize;
+        while (cur <= end) {
+                /* reached i_size inside the page: the remainder is skipped */
+                if (cur >= last_byte) {
+                        clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
+                        unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
+                        if (tree->ops && tree->ops->writepage_end_io_hook)
+                                tree->ops->writepage_end_io_hook(page, cur,
+                                                         page_end, NULL, 1);
+                        unlock_start = page_end + 1;
+                        break;
+                }
+                em = epd->get_extent(inode, page, pg_offset, cur,
+                                     end - cur + 1, 1);
+                /*
+                 * lookup failed: flag the page and stop.  The part of the
+                 * range that is still locked is released after 'done'.
+                 */
+                if (IS_ERR(em) || !em) {
+                        SetPageError(page);
+                        break;
+                }
+                extent_offset = cur - em->start;
+                BUG_ON(extent_map_end(em) <= cur);
+                BUG_ON(end < cur);
+                /*
+                 * bytes to handle this round: up to the end of the extent
+                 * or of the page, rounded up to a multiple of the block size
+                 */
+                iosize = min(extent_map_end(em) - cur, end - cur + 1);
+                iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
+                sector = (em->block_start + extent_offset) >> 9;
+                bdev = em->bdev;
+                block_start = em->block_start;
+                compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+                free_extent_map(em);
+                em = NULL;
+                /*
+                 * compressed and inline extents are written through other
+                 * paths in the FS
+                 */
+                if (compressed || block_start == EXTENT_MAP_HOLE ||
+                    block_start == EXTENT_MAP_INLINE) {
+                        clear_extent_dirty(tree, cur,
+                                           cur + iosize - 1, GFP_NOFS);
+                        unlock_extent(tree, unlock_start, cur + iosize - 1,
+                                      GFP_NOFS);
+                        /*
+                         * end_io notification does not happen here for
+                         * compressed extents
+                         */
+                        if (!compressed && tree->ops &&
+                            tree->ops->writepage_end_io_hook)
+                                tree->ops->writepage_end_io_hook(page, cur,
+                                                         cur + iosize - 1,
+                                                         NULL, 1);
+                        else if (compressed) {
+                                /* we don't want to end_page_writeback on
+                                 * a compressed extent.  this happens
+                                 * elsewhere
+                                 */
+                                nr++;
+                        }
+                        cur += iosize;
+                        pg_offset += iosize;
+                        unlock_start = cur;
+                        continue;
+                }
+                /* leave this out until we have a page_mkwrite call */
+                if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
+                                   EXTENT_DIRTY, 0)) {
+                        cur = cur + iosize;
+                        pg_offset += iosize;
+                        continue;
+                }
+                clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
+                /* the optional hook can veto the write by returning non-zero */
+                if (tree->ops && tree->ops->writepage_io_hook) {
+                        ret = tree->ops->writepage_io_hook(page, cur,
+                                                cur + iosize - 1);
+                } else {
+                        ret = 0;
+                }
+                if (ret) {
+                        SetPageError(page);
+                } else {
+                        unsigned long max_nr = end_index + 1;
+                        set_range_writeback(tree, cur, cur + iosize - 1);
+                        if (!PageWriteback(page)) {
+                                printk(KERN_ERR "btrfs warning page %lu not "
+                                       "writeback, cur %llu end %llu\n",
+                                       page->index, (unsigned long long)cur,
+                                       (unsigned long long)end);
+                        }
+                        /* queue the range; the bio accumulates in epd->bio */
+                        ret = submit_extent_page(WRITE, tree, page, sector,
+                                                 iosize, pg_offset, bdev,
+                                                 &epd->bio, max_nr,
+                                                 end_bio_extent_writepage,
+                                                 0, 0, 0);
+                        if (ret)
+                                SetPageError(page);
+                }
+                cur = cur + iosize;
+                pg_offset += iosize;
+                /* counted even when the hook or the submit failed */
+                nr++;
+        }
+done:
+        if (nr == 0) {
+                /* make sure the mapping tag for page dirty gets cleared */
+                set_page_writeback(page);
+                end_page_writeback(page);
+        }
+        /* release whatever part of the page range is still locked */
+        if (unlock_start <= page_end)
+                unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
+        unlock_page(page);
+update_nr_written:
+        /*
+         * account the pages written and, for cyclic or whole-file
+         * writeback, remember where the next pass should resume.
+         */
+        wbc->nr_to_write -= nr_written;
+        if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
+            wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
+                page->mapping->writeback_index = page->index + nr_written;
+        return 0;
+}
+/**
+ * extent_write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
+ * @tree: extent io tree; if its ops provide write_cache_pages_lock_hook, that
+ *        hook is called for each page instead of lock_page()
+ * @mapping: address space structure to write
+ * @wbc: writeback control; this function only tests wbc->nr_to_write after
+ *       each @writepage call and stops scanning once it is <= 0
+ * @writepage: function called for each page
+ * @data: data passed to writepage function and to @flush_fn
+ * @flush_fn: called with @data before waiting on a page that is already under
+ *            writeback (only when wbc->sync_mode is not WB_SYNC_NONE)
+ *
+ * If a page is already under I/O, extent_write_cache_pages() skips it, even
+ * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
+ * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
+ * and msync() need to guarantee that all the data which was dirty at the time
+ * the call was made get new I/O started against them.  If wbc->sync_mode is
+ * WB_SYNC_ALL then we were called for data integrity and we must wait for
+ * existing IO to complete.
+ *
+ * Returns the value of the most recent @writepage call (with
+ * AOP_WRITEPAGE_ACTIVATE turned into 0), or 0 if @writepage was never called.
+ */
+static int extent_write_cache_pages(struct extent_io_tree *tree,
+                             struct address_space *mapping,
+                             struct writeback_control *wbc,
+                             writepage_t writepage, void *data,
+                             void (*flush_fn)(void *))
+{
+        struct backing_dev_info *bdi = mapping->backing_dev_info;
+        int ret = 0;
+        int done = 0;
+        struct pagevec pvec;
+        int nr_pages;
+        pgoff_t index;
+        pgoff_t end;            /* Inclusive */
+        int scanned = 0;
+        int range_whole = 0;
+        /* non-blocking callers give up immediately on a congested device */
+        if (wbc->nonblocking && bdi_write_congested(bdi)) {
+                wbc->encountered_congestion = 1;
+                return 0;
+        }
+        pagevec_init(&pvec, 0);
+        if (wbc->range_cyclic) {
+                index = mapping->writeback_index; /* Start from prev offset */
+                /* no upper bound, assuming pgoff_t is unsigned */
+                end = -1;
+        } else {
+                index = wbc->range_start >> PAGE_CACHE_SHIFT;
+                end = wbc->range_end >> PAGE_CACHE_SHIFT;
+                if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+                        range_whole = 1;
+                /* an explicit range is never retried from index 0 below */
+                scanned = 1;
+        }
+retry:
+        while (!done && (index <= end) &&
+               (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+                              PAGECACHE_TAG_DIRTY, min(end - index,
+                                  (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+                unsigned i;
+                scanned = 1;
+                /*
+                 * Note that setting done below does not leave this for loop:
+                 * the remaining pages of the current pagevec are still
+                 * processed and ret is overwritten by each writepage call.
+                 */
+                for (i = 0; i < nr_pages; i++) {
+                        struct page *page = pvec.pages[i];
+                        /*
+                         * At this point we hold neither mapping->tree_lock nor
+                         * lock on the page itself: the page may be truncated or
+                         * invalidated (changing page->mapping to NULL), or even
+                         * swizzled back from swapper_space to tmpfs file
+                         * mapping
+                         */
+                        if (tree->ops && tree->ops->write_cache_pages_lock_hook)
+                                tree->ops->write_cache_pages_lock_hook(page);
+                        else
+                                lock_page(page);
+                        /* the page left this mapping while it was unlocked */
+                        if (unlikely(page->mapping != mapping)) {
+                                unlock_page(page);
+                                continue;
+                        }
+                        /* past the end of the requested range */
+                        if (!wbc->range_cyclic && page->index > end) {
+                                done = 1;
+                                unlock_page(page);
+                                continue;
+                        }
+                        if (wbc->sync_mode != WB_SYNC_NONE) {
+                                /*
+                                 * flush_fn is called before waiting on the
+                                 * writeback that is in progress
+                                 */
+                                if (PageWriteback(page))
+                                        flush_fn(data);
+                                wait_on_page_writeback(page);
+                        }
+                        /* skip pages still under IO or no longer dirty */
+                        if (PageWriteback(page) ||
+                            !clear_page_dirty_for_io(page)) {
+                                unlock_page(page);
+                                continue;
+                        }
+                        ret = (*writepage)(page, wbc, data);
+                        /*
+                         * on AOP_WRITEPAGE_ACTIVATE the page is unlocked here
+                         * and the result is not treated as an error
+                         */
+                        if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
+                                unlock_page(page);
+                                ret = 0;
+                        }
+                        if (ret || wbc->nr_to_write <= 0)
+                                done = 1;
+                        if (wbc->nonblocking && bdi_write_congested(bdi)) {
+                                wbc->encountered_congestion = 1;
+                                done = 1;
+                        }
+                }
+                pagevec_release(&pvec);
+                cond_resched();
+        }
+        if (!scanned && !done) {
+                /*
+                 * We hit the last page and there is more work to be done: wrap
+                 * back to the start of the file
+                 */
+                scanned = 1;
+                index = 0;
+                goto retry;
+        }
+        return ret;
+}
+/*
+ * Flush callback: @data is a struct extent_page_data.  If it carries a bio,
+ * submit it as a WRITE and reset the pointer to NULL.
+ */
+static noinline void flush_write_bio(void *data)
+{
+        struct extent_page_data *page_data = data;
+        if (!page_data->bio)
+                return;
+        submit_one_bio(WRITE, page_data->bio, 0, 0);
+        page_data->bio = NULL;
+}
+/*
+ * Write @page through __extent_writepage(), then run a WB_SYNC_NONE pass
+ * (nr_to_write = 64) over the dirty pages that follow it in the same mapping.
+ * Any bio left pending afterwards is submitted.
+ *
+ * Returns the result of __extent_writepage() for @page; the result of the
+ * follow-on pass is not reported.
+ */
+int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
+                          get_extent_t *get_extent,
+                          struct writeback_control *wbc)
+{
+        /* sampled before the page is handed to __extent_writepage() */
+        struct address_space *space = page->mapping;
+        struct extent_page_data page_data = {
+                .tree = tree,
+                .get_extent = get_extent,
+                .bio = NULL,
+                .extent_locked = 0,
+        };
+        struct writeback_control wbc_following = {
+                .bdi            = wbc->bdi,
+                .sync_mode      = WB_SYNC_NONE,
+                .older_than_this = NULL,
+                .nr_to_write    = 64,
+                .range_start    = page_offset(page) + PAGE_CACHE_SIZE,
+                .range_end      = (loff_t)-1,
+        };
+        int page_ret;
+        page_ret = __extent_writepage(page, wbc, &page_data);
+        extent_write_cache_pages(tree, space, &wbc_following,
+                                 __extent_writepage, &page_data,
+                                 flush_write_bio);
+        if (page_data.bio)
+                submit_one_bio(WRITE, page_data.bio, 0, 0);
+        return page_ret;
+}
+/*
+ * Write out the pages covering bytes [@start, @end] of @inode, one page at a
+ * time, with extent_page_data.extent_locked set to 1.
+ *
+ * Pages that are still dirty go through __extent_writepage().  Clean pages
+ * get the tree's writepage_end_io_hook (if any) and are unlocked here, so
+ * presumably the caller holds the page locks on entry -- TODO confirm.
+ *
+ * Returns the result of the last __extent_writepage() call, or 0 if none
+ * was made.
+ */
+int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
+                              u64 start, u64 end, get_extent_t *get_extent,
+                              int mode)
+{
+        struct address_space *mapping = inode->i_mapping;
+        unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >>
+                PAGE_CACHE_SHIFT;
+        struct extent_page_data page_data = {
+                .tree = tree,
+                .get_extent = get_extent,
+                .bio = NULL,
+                .extent_locked = 1,
+        };
+        struct writeback_control range_wbc = {
+                .bdi            = inode->i_mapping->backing_dev_info,
+                .sync_mode      = mode,
+                .older_than_this = NULL,
+                .nr_to_write    = nr_pages * 2,
+                .range_start    = start,
+                .range_end      = end + 1,
+        };
+        struct page *page;
+        u64 cur;
+        int ret = 0;
+        for (cur = start; cur <= end; cur += PAGE_CACHE_SIZE) {
+                page = find_get_page(mapping, cur >> PAGE_CACHE_SHIFT);
+                if (clear_page_dirty_for_io(page)) {
+                        ret = __extent_writepage(page, &range_wbc, &page_data);
+                } else {
+                        if (tree->ops && tree->ops->writepage_end_io_hook)
+                                tree->ops->writepage_end_io_hook(page, cur,
+                                                 cur + PAGE_CACHE_SIZE - 1,
+                                                 NULL, 1);
+                        unlock_page(page);
+                }
+                /* drop the reference taken by find_get_page() */
+                page_cache_release(page);
+        }
+        if (page_data.bio)
+                submit_one_bio(WRITE, page_data.bio, 0, 0);
+        return ret;
+}
+/*
+ * Write the dirty pages of @mapping selected by @wbc, using
+ * __extent_writepage() for each page, and submit whatever bio is still
+ * pending once the walk is over.
+ *
+ * Returns the result of extent_write_cache_pages().
+ */
+int extent_writepages(struct extent_io_tree *tree,
+                      struct address_space *mapping,
+                      get_extent_t *get_extent,
+                      struct writeback_control *wbc)
+{
+        struct extent_page_data page_data = {
+                .tree = tree,
+                .get_extent = get_extent,
+                .bio = NULL,
+                .extent_locked = 0,
+        };
+        int err;
+        err = extent_write_cache_pages(tree, mapping, wbc,
+                                       __extent_writepage, &page_data,
+                                       flush_write_bio);
+        if (page_data.bio)
+                submit_one_bio(WRITE, page_data.bio, 0, 0);
+        return err;
+}
+/*
+ * Read a batch of pages: take @nr_pages pages off the tail of @pages, insert
+ * each into the page cache of @mapping and start a read for those that were
+ * inserted.  Pages whose insertion fails are simply released.
+ *
+ * Always returns 0.
+ */
+int extent_readpages(struct extent_io_tree *tree,
+                     struct address_space *mapping,
+                     struct list_head *pages, unsigned nr_pages,
+                     get_extent_t get_extent)
+{
+        unsigned long bio_flags = 0;
+        struct bio *bio = NULL;
+        struct pagevec lru_pvec;
+        unsigned remaining;
+        pagevec_init(&lru_pvec, 0);
+        for (remaining = nr_pages; remaining; remaining--) {
+                struct page *page = list_entry(pages->prev, struct page, lru);
+                prefetchw(&page->flags);
+                list_del(&page->lru);
+                /*
+                 * add_to_page_cache_lru is what we really want, but it is
+                 * not exported, so it is reproduced by hand here
+                 */
+                if (add_to_page_cache(page, mapping,
+                                      page->index, GFP_KERNEL) == 0) {
+                        /* hand-rolled lru_cache_add, also not exported */
+                        page_cache_get(page);
+                        if (!pagevec_add(&lru_pvec, page))
+                                __pagevec_lru_add_file(&lru_pvec);
+                        __extent_read_full_page(tree, page, get_extent,
+                                                &bio, 0, &bio_flags);
+                }
+                page_cache_release(page);
+        }
+        /* push any pages still sitting in the pagevec onto the LRU */
+        if (pagevec_count(&lru_pvec))
+                __pagevec_lru_add_file(&lru_pvec);
+        BUG_ON(!list_empty(pages));
+        if (bio)
+                submit_one_bio(READ, bio, 0, bio_flags);
+        return 0;
+}
+/*
+ * Basic invalidatepage helper.  The part of @page starting at @offset,
+ * rounded up to the next block boundary, is locked in @tree, writeback on it
+ * is waited for, and its LOCKED, DIRTY and DELALLOC extent state bits are
+ * cleared.  Nothing is done when the rounded start lies beyond the page.
+ *
+ * Always returns 0.
+ */
+int extent_invalidatepage(struct extent_io_tree *tree,
+                          struct page *page, unsigned long offset)
+{
+        size_t blocksize = page->mapping->host->i_sb->s_blocksize;
+        u64 page_start = ((u64)page->index << PAGE_CACHE_SHIFT);
+        u64 end = page_start + PAGE_CACHE_SIZE - 1;
+        u64 start = page_start;
+        /* round the in-page offset up to a block boundary */
+        start += (offset + blocksize - 1) & ~(blocksize - 1);
+        if (start <= end) {
+                lock_extent(tree, start, end, GFP_NOFS);
+                wait_on_extent_writeback(tree, start, end);
+                clear_extent_bit(tree, start, end,
+                                 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
+                                 1, 1, GFP_NOFS);
+        }
+        return 0;
+}
+/*
+ * Simple commit_write helper: flag @page as extent mapped, mark it dirty and
+ * grow the inode size when the write ended past the current i_size.
+ *
+ * Always returns 0.
+ */
+int extent_commit_write(struct extent_io_tree *tree,
+                        struct inode *inode, struct page *page,
+                        unsigned from, unsigned to)
+{
+        /* file offset just past the last byte written */
+        loff_t write_end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+        set_page_extent_mapped(page);
+        set_page_dirty(page);
+        if (write_end <= inode->i_size)
+                return 0;
+        i_size_write(inode, write_end);
+        mark_inode_dirty(inode);
+        return 0;
+}
+/*
+ * Prepare the byte range [@from, @to) of @page for a write.
+ *
+ * The range is widened to block boundaries and walked one extent at a time,
+ * with extents looked up through @get_extent:
+ *  - for a range reported new by clear_extent_new(), the parts of the block
+ *    outside [@from, @to) are zeroed when the page is not uptodate;
+ *  - an existing range (neither hole nor inline) that is only partly covered
+ *    by the write and not yet uptodate is read in;
+ *  - everything else is marked uptodate in @tree and unlocked.
+ * If any read was submitted, the function waits for EXTENT_LOCKED to clear
+ * on the block range before returning.
+ *
+ * Returns 0 on success, or a negative errno when @get_extent fails.
+ */
+int extent_prepare_write(struct extent_io_tree *tree,
+                         struct inode *inode, struct page *page,
+                         unsigned from, unsigned to, get_extent_t *get_extent)
+{
+        u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
+        u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
+        u64 block_start;
+        u64 orig_block_start;
+        u64 block_end;
+        u64 cur_end;
+        struct extent_map *em;
+        unsigned blocksize = 1 << inode->i_blkbits;
+        size_t page_offset = 0;
+        size_t block_off_start;
+        size_t block_off_end;
+        int err = 0;
+        int iocount = 0;
+        int ret = 0;
+        int isnew;
+        set_page_extent_mapped(page);
+        /* widen [from, to) to whole blocks */
+        block_start = (page_start + from) & ~((u64)blocksize - 1);
+        block_end = (page_start + to - 1) | (blocksize - 1);
+        orig_block_start = block_start;
+        lock_extent(tree, page_start, page_end, GFP_NOFS);
+        while (block_start <= block_end) {
+                em = get_extent(inode, page, page_offset, block_start,
+                                block_end - block_start + 1, 1);
+                if (IS_ERR(em) || !em) {
+                        /*
+                         * Report the failure to the caller instead of
+                         * falling out with err still 0.
+                         */
+                        err = em ? PTR_ERR(em) : -EIO;
+                        goto err;
+                }
+                cur_end = min(block_end, extent_map_end(em) - 1);
+                block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
+                block_off_end = block_off_start + blocksize;
+                isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS);
+                /* new block: zero the parts the write will not cover */
+                if (!PageUptodate(page) && isnew &&
+                    (block_off_end > to || block_off_start < from)) {
+                        void *kaddr;
+                        kaddr = kmap_atomic(page, KM_USER0);
+                        if (block_off_end > to)
+                                memset(kaddr + to, 0, block_off_end - to);
+                        if (block_off_start < from)
+                                memset(kaddr + block_off_start, 0,
+                                       from - block_off_start);
+                        flush_dcache_page(page);
+                        kunmap_atomic(kaddr, KM_USER0);
+                }
+                /* existing block, partially overwritten, not yet read */
+                if ((em->block_start != EXTENT_MAP_HOLE &&
+                     em->block_start != EXTENT_MAP_INLINE) &&
+                    !isnew && !PageUptodate(page) &&
+                    (block_off_end > to || block_off_start < from) &&
+                    !test_range_bit(tree, block_start, cur_end,
+                                    EXTENT_UPTODATE, 1)) {
+                        u64 sector;
+                        u64 extent_offset = block_start - em->start;
+                        size_t iosize;
+                        sector = (em->block_start + extent_offset) >> 9;
+                        iosize = (cur_end - block_start + blocksize) &
+                                ~((u64)blocksize - 1);
+                        /*
+                         * we've already got the extent locked, but we
+                         * need to split the state such that our end_bio
+                         * handler can clear the lock.
+                         */
+                        set_extent_bit(tree, block_start,
+                                       block_start + iosize - 1,
+                                       EXTENT_LOCKED, 0, NULL, GFP_NOFS);
+                        ret = submit_extent_page(READ, tree, page,
+                                         sector, iosize, page_offset, em->bdev,
+                                         NULL, 1,
+                                         end_bio_extent_preparewrite, 0,
+                                         0, 0);
+                        iocount++;
+                        block_start = block_start + iosize;
+                } else {
+                        set_extent_uptodate(tree, block_start, cur_end,
+                                            GFP_NOFS);
+                        unlock_extent(tree, block_start, cur_end, GFP_NOFS);
+                        block_start = cur_end + 1;
+                }
+                page_offset = block_start & (PAGE_CACHE_SIZE - 1);
+                free_extent_map(em);
+        }
+        if (iocount) {
+                wait_extent_bit(tree, orig_block_start,
+                                block_end, EXTENT_LOCKED);
+        }
+        check_page_uptodate(tree, page);
+err:
+        /* FIXME, zero out newly allocated blocks on error */
+        /*
+         * NOTE(review): when get_extent fails, the part of the extent range
+         * locked above that was not yet processed is not unlocked here.
+         */
+        return err;
+}
+/*
+ * releasepage helper.  If any part of the page's byte range has one of the
+ * EXTENT_IOBITS or EXTENT_ORDERED bits set, the page must stay and 0 is
+ * returned.  Otherwise the EXTENT_UPTODATE state for the range is cleared
+ * and 1 is returned.
+ */
+int try_release_extent_state(struct extent_map_tree *map,
+                             struct extent_io_tree *tree, struct page *page,
+                             gfp_t mask)
+{
+        u64 first = (u64)page->index << PAGE_CACHE_SHIFT;
+        u64 last = first + PAGE_CACHE_SIZE - 1;
+        if (test_range_bit(tree, first, last,
+                           EXTENT_IOBITS | EXTENT_ORDERED, 0))
+                return 0;
+        /* a mask that includes all of GFP_NOFS is reduced to GFP_NOFS */
+        if ((mask & GFP_NOFS) == GFP_NOFS)
+                mask = GFP_NOFS;
+        clear_extent_bit(tree, first, last, EXTENT_UPTODATE, 1, 1, mask);
+        return 1;
+}
+/*
+ * a helper for releasepage.  As long as there are no locked extents
+ * in the range corresponding to the page, both state records and extent
+ * map records are removed
+ *
+ * Extent map records are only considered for removal when @mask allows
+ * waiting (__GFP_WAIT) and the inode is larger than 16MB.  The return value
+ * is that of try_release_extent_state() for the page.
+ */
+int try_release_extent_mapping(struct extent_map_tree *map,
+                               struct extent_io_tree *tree, struct page *page,
+                               gfp_t mask)
+{
+        struct extent_map *em;
+        u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+        u64 end = start + PAGE_CACHE_SIZE - 1;
+        if ((mask & __GFP_WAIT) &&
+            page->mapping->host->i_size > 16 * 1024 * 1024) {
+                u64 len;
+                /* walk the extent maps covering the page, front to back */
+                while (start <= end) {
+                        len = end - start + 1;
+                        spin_lock(&map->lock);
+                        em = lookup_extent_mapping(map, start, len);
+                        if (!em || IS_ERR(em)) {
+                                spin_unlock(&map->lock);
+                                break;
+                        }
+                        /*
+                         * stop at a pinned mapping, or at one that does not
+                         * begin exactly at the current position
+                         */
+                        if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
+                            em->start != start) {
+                                spin_unlock(&map->lock);
+                                free_extent_map(em);
+                                break;
+                        }
+                        /* only drop the mapping if no IO state is pending */
+                        if (!test_range_bit(tree, em->start,
+                                            extent_map_end(em) - 1,
+                                            EXTENT_LOCKED | EXTENT_WRITEBACK |
+                                            EXTENT_ORDERED,
+                                            0)) {
+                                remove_extent_mapping(map, em);
+                                /* once for the rb tree */
+                                free_extent_map(em);
+                        }
+                        /* em is still safe to read: the lookup ref is held */
+                        start = extent_map_end(em);
+                        spin_unlock(&map->lock);
+                        /* once for us */
+                        free_extent_map(em);
+                }
+        }
+        return try_release_extent_state(map, tree, page, mask);
+}
+/*
+ * bmap helper: translate file block @iblock of the inode behind @mapping
+ * into a block number, using @get_extent to find the extent that covers it.
+ * The lookup is done with the block's byte range locked in the inode's
+ * io_tree.
+ *
+ * Returns 0 when no extent is found or when the extent's block_start is
+ * above EXTENT_MAP_LAST_BYTE.
+ */
+sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
+                get_extent_t *get_extent)
+{
+        struct inode *inode = mapping->host;
+        /*
+         * Widen before shifting: sector_t can be narrower than 64 bits, in
+         * which case the byte offset would be truncated for large files.
+         */
+        u64 start = (u64)iblock << inode->i_blkbits;
+        sector_t sector = 0;
+        size_t blksize = (1 << inode->i_blkbits);
+        struct extent_map *em;
+        lock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
+                    GFP_NOFS);
+        em = get_extent(inode, NULL, 0, start, blksize, 0);
+        unlock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
+                      GFP_NOFS);
+        if (!em || IS_ERR(em))
+                return 0;
+        if (em->block_start > EXTENT_MAP_LAST_BYTE)
+                goto out;
+        /* byte address of the block, converted back to block units */
+        sector = (em->block_start + start - em->start) >> inode->i_blkbits;
+out:
+        free_extent_map(em);
+        return sector;
+}
+/*
+ * Return page number @i of extent buffer @eb.  Page 0 is cached in
+ * eb->first_page; the others are looked up in the page tree of the mapping
+ * that the first page belongs to.  Returns NULL if the first page has no
+ * mapping.
+ */
+static inline struct page *extent_buffer_page(struct extent_buffer *eb,
+                                              unsigned long i)
+{
+        struct address_space *mapping;
+        struct page *found;
+        unsigned long index;
+        if (!i)
+                return eb->first_page;
+        mapping = eb->first_page->mapping;
+        if (!mapping)
+                return NULL;
+        /* page cache index = buffer-relative index + index of the start */
+        index = i + (eb->start >> PAGE_CACHE_SHIFT);
+        /*
+         * Callers pin the page with a reference before calling
+         * extent_buffer_page, so the page is known to be in the radix tree.
+         */
+        rcu_read_lock();
+        found = radix_tree_lookup(&mapping->page_tree, index);
+        rcu_read_unlock();
+        return found;
+}
+/*
+ * Number of page cache pages touched by the byte range that begins at
+ * @start and is @len bytes long.
+ */
+static inline unsigned long num_extent_pages(u64 start, u64 len)
+{
+        u64 first = start >> PAGE_CACHE_SHIFT;
+        u64 past_last = (start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+        return past_last - first;
+}
+/*
+ * Allocate a zeroed extent_buffer from extent_buffer_cache and initialise
+ * its start, len, mutex and reference count (set to 1).  With LEAK_DEBUG
+ * the buffer is also added to the global leak list.
+ *
+ * Returns the new buffer, or NULL if the allocation fails.
+ */
+static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
+                                                   u64 start,
+                                                   unsigned long len,
+                                                   gfp_t mask)
+{
+        struct extent_buffer *eb = NULL;
+#ifdef LEAK_DEBUG
+        unsigned long flags;
+#endif
+        eb = kmem_cache_zalloc(extent_buffer_cache, mask);
+        /* the allocation can fail; callers already test for NULL */
+        if (!eb)
+                return NULL;
+        eb->start = start;
+        eb->len = len;
+        mutex_init(&eb->mutex);
+#ifdef LEAK_DEBUG
+        spin_lock_irqsave(&leak_lock, flags);
+        list_add(&eb->leak_list, &buffers);
+        spin_unlock_irqrestore(&leak_lock, flags);
+#endif
+        atomic_set(&eb->refs, 1);
+        return eb;
+}
+/*
+ * Give @eb back to extent_buffer_cache.  With LEAK_DEBUG it is first taken
+ * off the leak list under leak_lock.
+ */
+static void __free_extent_buffer(struct extent_buffer *eb)
+{
+#ifdef LEAK_DEBUG
+        unsigned long irq_flags;
+        spin_lock_irqsave(&leak_lock, irq_flags);
+        list_del(&eb->leak_list);
+        spin_unlock_irqrestore(&leak_lock, irq_flags);
+#endif
+        kmem_cache_free(extent_buffer_cache, eb);
+}
+/*
+ * Find or create the extent buffer that starts at @start in @tree.
+ *
+ * If a buffer for @start is already in the tree it is returned with an extra
+ * reference.  Otherwise a new buffer is allocated, its pages are found or
+ * created in tree->mapping (@page0, when given, is used as the first page)
+ * and the buffer is inserted into the tree.  If another buffer was inserted
+ * for the same @start in the meantime, the new one is torn down and the
+ * existing one is returned with an extra reference.
+ *
+ * Returns the buffer, or NULL when the buffer or one of its pages could not
+ * be allocated.
+ */
+struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
+                                          u64 start, unsigned long len,
+                                          struct page *page0,
+                                          gfp_t mask)
+{
+        unsigned long num_pages = num_extent_pages(start, len);
+        unsigned long i;
+        unsigned long index = start >> PAGE_CACHE_SHIFT;
+        struct extent_buffer *eb;
+        struct extent_buffer *exists = NULL;
+        struct page *p;
+        struct address_space *mapping = tree->mapping;
+        int uptodate = 1;
+        /* fast path: the buffer is already in the tree */
+        spin_lock(&tree->buffer_lock);
+        eb = buffer_search(tree, start);
+        if (eb) {
+                atomic_inc(&eb->refs);
+                spin_unlock(&tree->buffer_lock);
+                mark_page_accessed(eb->first_page);
+                return eb;
+        }
+        spin_unlock(&tree->buffer_lock);
+        eb = __alloc_extent_buffer(tree, start, len, mask);
+        if (!eb)
+                return NULL;
+        if (page0) {
+                eb->first_page = page0;
+                i = 1;
+                index++;
+                page_cache_get(page0);
+                mark_page_accessed(page0);
+                set_page_extent_mapped(page0);
+                set_page_extent_head(page0, len);
+                uptodate = PageUptodate(page0);
+        } else {
+                i = 0;
+        }
+        for (; i < num_pages; i++, index++) {
+                p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM);
+                if (!p) {
+                        WARN_ON(1);
+                        goto free_eb;
+                }
+                set_page_extent_mapped(p);
+                mark_page_accessed(p);
+                if (i == 0) {
+                        eb->first_page = p;
+                        set_page_extent_head(p, len);
+                } else {
+                        set_page_private(p, EXTENT_PAGE_PRIVATE);
+                }
+                if (!PageUptodate(p))
+                        uptodate = 0;
+                unlock_page(p);
+        }
+        if (uptodate)
+                eb->flags |= EXTENT_UPTODATE;
+        eb->flags |= EXTENT_BUFFER_FILLED;
+        spin_lock(&tree->buffer_lock);
+        exists = buffer_tree_insert(tree, start, &eb->rb_node);
+        if (exists) {
+                /* add one reference for the caller */
+                atomic_inc(&exists->refs);
+                spin_unlock(&tree->buffer_lock);
+                goto free_eb;
+        }
+        spin_unlock(&tree->buffer_lock);
+        /* add one reference for the tree */
+        atomic_inc(&eb->refs);
+        return eb;
+free_eb:
+        if (!atomic_dec_and_test(&eb->refs))
+                return exists;
+        /* drop the page references taken above: pages 1..i-1, then page 0 */
+        for (index = 1; index < i; index++)
+                page_cache_release(extent_buffer_page(eb, index));
+        /*
+         * first_page is still NULL when the lookup of the very first page
+         * failed and no page0 was supplied; there is nothing to release then.
+         */
+        if (eb->first_page)
+                page_cache_release(eb->first_page);
+        __free_extent_buffer(eb);
+        return exists;
+}
+/*
+ * Look up the extent buffer starting at @start in @tree.  On a hit the
+ * buffer gains a reference and its first page is marked accessed.
+ *
+ * Returns the buffer, or NULL if none is found.
+ */
+struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
+                                         u64 start, unsigned long len,
+                                          gfp_t mask)
+{
+        struct extent_buffer *found;
+        spin_lock(&tree->buffer_lock);
+        found = buffer_search(tree, start);
+        if (found)
+                atomic_inc(&found->refs);
+        spin_unlock(&tree->buffer_lock);
+        if (!found)
+                return NULL;
+        mark_page_accessed(found->first_page);
+        return found;
+}
+/*
+ * Drop one reference on @eb (NULL is accepted and ignored).  Reaching a
+ * reference count of zero here triggers a warning; the buffer is not freed
+ * by this function.
+ */
+void free_extent_buffer(struct extent_buffer *eb)
+{
+        if (eb && atomic_dec_and_test(&eb->refs))
+                WARN_ON(1);
+}
+/*
+ * Clear the dirty state of extent buffer @eb: first the EXTENT_DIRTY range
+ * in @tree, then the dirty flag and page cache dirty tag of each of its
+ * pages.  Always returns 0.
+ */
+int clear_extent_buffer_dirty(struct extent_io_tree *tree,
+                              struct extent_buffer *eb)
+{
+        int set;
+        unsigned long i;
+        unsigned long num_pages;
+        struct page *page;
+        u64 start = eb->start;
+        u64 end = start + eb->len - 1;
+        set = clear_extent_dirty(tree, start, end, GFP_NOFS);
+        num_pages = num_extent_pages(eb->start, eb->len);
+        for (i = 0; i < num_pages; i++) {
+                page = extent_buffer_page(eb, i);
+                /* nothing to do for a page that is not dirty when set is 0 */
+                if (!set && !PageDirty(page))
+                        continue;
+                lock_page(page);
+                /* re-establish page->private under the page lock */
+                if (i == 0)
+                        set_page_extent_head(page, eb->len);
+                else
+                        set_page_private(page, EXTENT_PAGE_PRIVATE);
+                /*
+                 * if we're on the last page or the first page and the
+                 * block isn't aligned on a page boundary, do extra checks
+                 * to make sure we don't clean page that is partially dirty
+                 */
+                if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
+                    ((i == num_pages - 1) &&
+                     ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
+                        /* start/end are reused here for the page's range */
+                        start = (u64)page->index << PAGE_CACHE_SHIFT;
+                        end  = start + PAGE_CACHE_SIZE - 1;
+                        if (test_range_bit(tree, start, end,
+                                           EXTENT_DIRTY, 0)) {
+                                unlock_page(page);
+                                continue;
+                        }
+                }
+                clear_page_dirty_for_io(page);
+                /*
+                 * drop the dirty tag in the page tree as well, unless the
+                 * page has been dirtied again in the meantime
+                 */
+                spin_lock_irq(&page->mapping->tree_lock);
+                if (!PageDirty(page)) {
+                        radix_tree_tag_clear(&page->mapping->page_tree,
+                                                page_index(page),
+                                                PAGECACHE_TAG_DIRTY);
+                }
+                spin_unlock_irq(&page->mapping->tree_lock);
+                unlock_page(page);
+        }
+        return 0;
+}
+/*
+ * Wait for writeback on the byte range covered by @eb; returns whatever
+ * wait_on_extent_writeback() returns.
+ */
+int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
+                                    struct extent_buffer *eb)
+{
+        u64 first = eb->start;
+        u64 last = first + eb->len - 1;
+        return wait_on_extent_writeback(tree, first, last);
+}
+/*
+ * Mark every page of @eb dirty, together with the matching byte ranges in
+ * @tree.  Always returns 0.
+ */
+int set_extent_buffer_dirty(struct extent_io_tree *tree,
+                             struct extent_buffer *eb)
+{
+        unsigned long nr = num_extent_pages(eb->start, eb->len);
+        unsigned long idx;
+        for (idx = 0; idx < nr; idx++) {
+                struct page *page = extent_buffer_page(eb, idx);
+                loff_t first_byte;
+                /*
+                 * page->private has to be valid before the page is dirtied:
+                 * writepage may treat the first page specially, and
+                 * releasepage may drop page->private on pages that are not
+                 * dirty yet.
+                 */
+                lock_page(page);
+                if (idx == 0)
+                        set_page_extent_head(page, eb->len);
+                else if (PagePrivate(page) &&
+                         page->private != EXTENT_PAGE_PRIVATE)
+                        set_page_extent_mapped(page);
+                __set_page_dirty_nobuffers(extent_buffer_page(eb, idx));
+                first_byte = page_offset(page);
+                set_extent_dirty(tree, first_byte,
+                                 first_byte + PAGE_CACHE_SIZE - 1,
+                                 GFP_NOFS);
+                unlock_page(page);
+        }
+        return 0;
+}
+/*
+ * Drop the uptodate state of @eb: the buffer flag, the extent range in
+ * @tree, and the flag on every page that can be found.  Always returns 0.
+ */
+int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
+                                struct extent_buffer *eb)
+{
+        unsigned long nr = num_extent_pages(eb->start, eb->len);
+        unsigned long idx;
+        eb->flags &= ~EXTENT_UPTODATE;
+        clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
+                              GFP_NOFS);
+        for (idx = 0; idx < nr; idx++) {
+                struct page *page = extent_buffer_page(eb, idx);
+                if (!page)
+                        continue;
+                ClearPageUptodate(page);
+        }
+        return 0;
+}
+/*
+ * Mark the byte range of @eb uptodate in @tree and set the uptodate flag on
+ * its pages.  A first or last page that the buffer covers only partially is
+ * passed to check_page_uptodate() instead of being flagged directly.
+ * Always returns 0.
+ */
+int set_extent_buffer_uptodate(struct extent_io_tree *tree,
+                                struct extent_buffer *eb)
+{
+        unsigned long nr = num_extent_pages(eb->start, eb->len);
+        unsigned long last = nr - 1;
+        unsigned long idx;
+        set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
+                            GFP_NOFS);
+        for (idx = 0; idx < nr; idx++) {
+                struct page *page = extent_buffer_page(eb, idx);
+                int partial;
+                if (idx == 0 && (eb->start & (PAGE_CACHE_SIZE - 1)))
+                        partial = 1;    /* buffer starts inside the page */
+                else if (idx == last &&
+                         ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))
+                        partial = 1;    /* buffer ends inside the page */
+                else
+                        partial = 0;
+                if (partial)
+                        check_page_uptodate(tree, page);
+                else
+                        SetPageUptodate(page);
+        }
+        return 0;
+}
+/*
+ * Report whether the byte range [@start, @end] is uptodate: either the whole
+ * range carries EXTENT_UPTODATE in @tree, or every page visited in
+ * tree->mapping (stepping by PAGE_CACHE_SIZE from @start) is uptodate.
+ *
+ * Returns 1 if so, 0 otherwise.
+ *
+ * NOTE(review): the result of find_get_page() is used without a NULL check;
+ * confirm that callers guarantee the pages are present.
+ */
+int extent_range_uptodate(struct extent_io_tree *tree,
+                          u64 start, u64 end)
+{
+        struct page *page;
+        u64 cur;
+        if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
+                return 1;
+        for (cur = start; cur <= end; cur += PAGE_CACHE_SIZE) {
+                int uptodate;
+                page = find_get_page(tree->mapping, cur >> PAGE_CACHE_SHIFT);
+                uptodate = PageUptodate(page);
+                page_cache_release(page);
+                if (!uptodate)
+                        return 0;
+        }
+        return 1;
+}
+/*
+ * Report whether @eb is uptodate.  Checked in order: the buffer's own
+ * EXTENT_UPTODATE flag, the EXTENT_UPTODATE state of its byte range in
+ * @tree, and finally the uptodate flag of each of its pages.
+ *
+ * Returns non-zero when uptodate, 0 otherwise.
+ */
+int extent_buffer_uptodate(struct extent_io_tree *tree,
+                           struct extent_buffer *eb)
+{
+        unsigned long nr;
+        unsigned long idx;
+        int range_ok;
+        if (eb->flags & EXTENT_UPTODATE)
+                return 1;
+        range_ok = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
+                                  EXTENT_UPTODATE, 1);
+        if (range_ok)
+                return range_ok;
+        nr = num_extent_pages(eb->start, eb->len);
+        for (idx = 0; idx < nr; idx++) {
+                if (!PageUptodate(extent_buffer_page(eb, idx)))
+                        return 0;
+        }
+        return 1;
+}
+/*
+ * Bring the pages backing extent buffer @eb up to date by reading them.
+ *
+ * @tree:       io tree consulted for the EXTENT_UPTODATE range bit
+ * @eb:         extent buffer to read
+ * @start:      0 to begin at the buffer's first page, otherwise a byte
+ *              address (expected >= eb->start) selecting the first page
+ * @wait:       non-zero to block on page locks and on read completion;
+ *              zero to use trylock and return once reads are submitted
+ * @get_extent: passed through to __extent_read_full_page()
+ * @mirror_num: passed through to the read and bio submission helpers
+ *
+ * Returns 0 if nothing needed reading, if a page could not be trylocked in
+ * the !wait case, or if the reads succeeded; otherwise the last error from
+ * __extent_read_full_page(), or -EIO if a page is still not uptodate after
+ * waiting.
+ */
+int read_extent_buffer_pages(struct extent_io_tree *tree,
+                             struct extent_buffer *eb,
+                             u64 start, int wait,
+                             get_extent_t *get_extent, int mirror_num)
+{
+        unsigned long i;
+        unsigned long start_i;
+        struct page *page;
+        int err;
+        int ret = 0;
+        int locked_pages = 0;
+        int all_uptodate = 1;
+        int inc_all_pages = 0;
+        unsigned long num_pages;
+        struct bio *bio = NULL;
+        unsigned long bio_flags = 0;
+        /* fast paths: buffer flag or range bit already says uptodate */
+        if (eb->flags & EXTENT_UPTODATE)
+                return 0;
+        if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
+                           EXTENT_UPTODATE, 1)) {
+                return 0;
+        }
+        /* translate @start into an index relative to the buffer's first page */
+        if (start) {
+                WARN_ON(start < eb->start);
+                start_i = (start >> PAGE_CACHE_SHIFT) -
+                        (eb->start >> PAGE_CACHE_SHIFT);
+        } else {
+                start_i = 0;
+        }
+        num_pages = num_extent_pages(eb->start, eb->len);
+        /* pass 1: lock every page and note whether any needs reading */
+        for (i = start_i; i < num_pages; i++) {
+                page = extent_buffer_page(eb, i);
+                if (!wait) {
+                        /* non-blocking caller: give up rather than sleep */
+                        if (!trylock_page(page))
+                                goto unlock_exit;
+                } else {
+                        lock_page(page);
+                }
+                locked_pages++;
+                if (!PageUptodate(page))
+                        all_uptodate = 0;
+        }
+        if (all_uptodate) {
+                /* only a scan that began at page 0 covered the whole buffer */
+                if (start_i == 0)
+                        eb->flags |= EXTENT_UPTODATE;
+                goto unlock_exit;
+        }
+        /* pass 2: queue reads for stale pages, unlock the uptodate ones */
+        for (i = start_i; i < num_pages; i++) {
+                page = extent_buffer_page(eb, i);
+                /*
+                 * Once a stale page has been seen in a scan from page 0,
+                 * every later page gets an extra reference.
+                 * NOTE(review): the matching release is not visible in this
+                 * function - confirm where it happens.
+                 */
+                if (inc_all_pages)
+                        page_cache_get(page);
+                if (!PageUptodate(page)) {
+                        if (start_i == 0)
+                                inc_all_pages = 1;
+                        ClearPageError(page);
+                        /* page stays locked here; presumably unlocked on I/O completion */
+                        err = __extent_read_full_page(tree, page,
+                                                      get_extent, &bio,
+                                                      mirror_num, &bio_flags);
+                        if (err)
+                                ret = err;
+                } else {
+                        unlock_page(page);
+                }
+        }
+        /* send whatever bio the read helper left pending */
+        if (bio)
+                submit_one_bio(READ, bio, mirror_num, bio_flags);
+        if (ret || !wait)
+                return ret;
+        /* blocking caller: wait for each page and verify the result */
+        for (i = start_i; i < num_pages; i++) {
+                page = extent_buffer_page(eb, i);
+                wait_on_page_locked(page);
+                if (!PageUptodate(page))
+                        ret = -EIO;
+        }
+        if (!ret)
+                eb->flags |= EXTENT_UPTODATE;
+        return ret;
+unlock_exit:
+        /* unlock exactly the pages locked in pass 1, starting at start_i */
+        i = start_i;
+        while (locked_pages > 0) {
+                page = extent_buffer_page(eb, i);
+                i++;
+                unlock_page(page);
+                locked_pages--;
+        }
+        return ret;
+}
+/*
+ * Copy @len bytes out of extent buffer @eb into @dstv, starting @start
+ * bytes into the buffer.
+ *
+ * The buffer need not begin on a page boundary, so the low bits of
+ * eb->start are added to @start before it is split into a page index and
+ * an in-page offset.  The copy is done one page at a time, each page being
+ * mapped with kmap_atomic() only while its chunk is copied.
+ */
+void read_extent_buffer(struct extent_buffer *eb, void *dstv,
+                        unsigned long start,
+                        unsigned long len)
+{
+        size_t cur;
+        size_t offset;
+        struct page *page;
+        char *kaddr;
+        char *dst = (char *)dstv;
+        size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+        unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+        WARN_ON(start > eb->len);
+        /* @start and @len are buffer-relative, so the limit is eb->len */
+        WARN_ON(start + len > eb->len);
+        offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
+        while (len > 0) {
+                page = extent_buffer_page(eb, i);
+                /* never run past the end of the current page */
+                cur = min(len, (PAGE_CACHE_SIZE - offset));
+                kaddr = kmap_atomic(page, KM_USER1);
+                memcpy(dst, kaddr + offset, cur);
+                kunmap_atomic(kaddr, KM_USER1);
+                dst += cur;
+                len -= cur;
+                /* every page after the first is read from its start */
+                offset = 0;
+                i++;
+        }
+}
+/*
+ * Map the single page of @eb that contains the bytes
+ * [@start, @start + @min_len) and hand the mapping back to the caller.
+ *
+ * On success:
+ *   *token     - address returned by kmap_atomic(), to be passed to
+ *                unmap_extent_buffer()
+ *   *map       - address of the first buffer byte inside the mapped page
+ *   *map_start - buffer-relative offset that *map corresponds to
+ *   *map_len   - number of bytes from *map to the end of the page
+ *
+ * Returns 0 on success, or -EINVAL (outputs untouched) when the requested
+ * bytes straddle a page boundary or reach past the end of the buffer.
+ */
+int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
+                               unsigned long min_len, char **token, char **map,
+                               unsigned long *map_start,
+                               unsigned long *map_len, int km)
+{
+        size_t offset;
+        char *kaddr;
+        struct page *p;
+        size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+        unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+        unsigned long end_i = (start_offset + start + min_len - 1) >>
+                PAGE_CACHE_SHIFT;
+        /* only one page is mapped, so the request must fit inside it */
+        if (i != end_i)
+                return -EINVAL;
+        /* refuse to map bytes that lie beyond the buffer instead of only warning */
+        if (start + min_len > eb->len) {
+                printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
+                       "wanted %lu %lu\n", (unsigned long long)eb->start,
+                       eb->len, start, min_len);
+                WARN_ON(1);
+                return -EINVAL;
+        }
+        if (i == 0) {
+                /* first page: buffer data begins start_offset bytes in */
+                offset = start_offset;
+                *map_start = 0;
+        } else {
+                offset = 0;
+                *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
+        }
+        p = extent_buffer_page(eb, i);
+        kaddr = kmap_atomic(p, km);
+        *token = kaddr;
+        *map = kaddr + offset;
+        *map_len = PAGE_CACHE_SIZE - offset;
+        return 0;
+}
+/*
+ * Map part of @eb like map_private_extent_buffer(), additionally keeping
+ * the buffer's own cached mapping (map_token/kaddr/map_start/map_len) in
+ * step.
+ *
+ * If @eb already holds a cached mapping it is unmapped first, using the
+ * caller's @km slot, and on success the new mapping is stored in its place.
+ * If @eb holds no cached mapping, nothing is cached.  Returns whatever
+ * map_private_extent_buffer() returns.
+ */
+int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
+                      unsigned long min_len,
+                      char **token, char **map,
+                      unsigned long *map_start,
+                      unsigned long *map_len, int km)
+{
+        int err;
+        int save = 0;
+        if (eb->map_token) {
+                /* drop the stale cached mapping before creating a new one */
+                unmap_extent_buffer(eb, eb->map_token, km);
+                eb->map_token = NULL;
+                save = 1;
+                /* warns if the cached mapping is replaced without eb->mutex held */
+                WARN_ON(!mutex_is_locked(&eb->mutex));
+        }
+        err = map_private_extent_buffer(eb, start, min_len, token, map,
+                                       map_start, map_len, km);
+        if (!err && save) {
+                /* remember the replacement mapping on the buffer */
+                eb->map_token = *token;
+                eb->kaddr = *map;
+                eb->map_start = *map_start;
+                eb->map_len = *map_len;
+        }
+        return err;
+}
+/*
+ * Undo a mapping made by map_extent_buffer()/map_private_extent_buffer().
+ * @token is the value those functions stored through their token argument
+ * and @km the kmap slot type to release; @eb itself is not touched.
+ */
+void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
+{
+        kunmap_atomic(token, km);
+}
+/*
+ * Compare @len bytes at @ptrv with the contents of extent buffer @eb,
+ * starting @start bytes into the buffer.
+ *
+ * The comparison proceeds page by page and stops at the first chunk that
+ * differs.  Returns 0 when all bytes match, otherwise the memcmp() result
+ * for that chunk (memory at @ptrv compared against the buffer).
+ */
+int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
+                          unsigned long start,
+                          unsigned long len)
+{
+        size_t cur;
+        size_t offset;
+        struct page *page;
+        char *kaddr;
+        char *ptr = (char *)ptrv;
+        size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+        unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+        int ret = 0;
+        WARN_ON(start > eb->len);
+        /* @start and @len are buffer-relative, so the limit is eb->len */
+        WARN_ON(start + len > eb->len);
+        offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
+        while (len > 0) {
+                page = extent_buffer_page(eb, i);
+                /* never run past the end of the current page */
+                cur = min(len, (PAGE_CACHE_SIZE - offset));
+                kaddr = kmap_atomic(page, KM_USER0);
+                ret = memcmp(ptr, kaddr + offset, cur);
+                kunmap_atomic(kaddr, KM_USER0);
+                if (ret)
+                        break;
+                ptr += cur;
+                len -= cur;
+                /* every page after the first is compared from its start */
+                offset = 0;
+                i++;
+        }
+        return ret;
+}
+/*
+ * Copy @len bytes from @srcv into extent buffer @eb, starting @start bytes
+ * into the buffer.
+ *
+ * The copy is split at page boundaries and each page is mapped with
+ * kmap_atomic() only while its chunk is written.  A destination page that
+ * is not uptodate triggers a warning but is written anyway.
+ */
+void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
+                         unsigned long start, unsigned long len)
+{
+        size_t cur;
+        size_t offset;
+        struct page *page;
+        char *kaddr;
+        char *src = (char *)srcv;
+        size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+        unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+        WARN_ON(start > eb->len);
+        /* @start and @len are buffer-relative, so the limit is eb->len */
+        WARN_ON(start + len > eb->len);
+        offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
+        while (len > 0) {
+                page = extent_buffer_page(eb, i);
+                WARN_ON(!PageUptodate(page));
+                /* never run past the end of the current page */
+                cur = min(len, PAGE_CACHE_SIZE - offset);
+                kaddr = kmap_atomic(page, KM_USER1);
+                memcpy(kaddr + offset, src, cur);
+                kunmap_atomic(kaddr, KM_USER1);
+                src += cur;
+                len -= cur;
+                /* every page after the first is written from its start */
+                offset = 0;
+                i++;
+        }
+}
+/*
+ * Fill @len bytes of extent buffer @eb with the byte @c, starting @start
+ * bytes into the buffer.
+ *
+ * The fill is split at page boundaries and each page is mapped with
+ * kmap_atomic() only while its chunk is written.  A page that is not
+ * uptodate triggers a warning but is filled anyway.
+ */
+void memset_extent_buffer(struct extent_buffer *eb, char c,
+                          unsigned long start, unsigned long len)
+{
+        size_t cur;
+        size_t offset;
+        struct page *page;
+        char *kaddr;
+        size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+        unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+        WARN_ON(start > eb->len);
+        /* @start and @len are buffer-relative, so the limit is eb->len */
+        WARN_ON(start + len > eb->len);
+        offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
+        while (len > 0) {
+                page = extent_buffer_page(eb, i);
+                WARN_ON(!PageUptodate(page));
+                /* never run past the end of the current page */
+                cur = min(len, PAGE_CACHE_SIZE - offset);
+                kaddr = kmap_atomic(page, KM_USER0);
+                memset(kaddr + offset, c, cur);
+                kunmap_atomic(kaddr, KM_USER0);
+                len -= cur;
+                /* every page after the first is filled from its start */
+                offset = 0;
+                i++;
+        }
+}
+/*
+ * Copy @len bytes from extent buffer @src (at @src_offset) into extent
+ * buffer @dst (at @dst_offset).
+ *
+ * Each destination page is mapped in turn and filled through
+ * read_extent_buffer(), which takes care of walking the source pages.
+ * Warns if the two buffers differ in length or if a destination page is
+ * not uptodate; the copy proceeds regardless.
+ */
+void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
+                        unsigned long dst_offset, unsigned long src_offset,
+                        unsigned long len)
+{
+        /* bytes between the start of dst's first page and dst->start */
+        size_t page_skew = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
+        unsigned long pg_idx = (page_skew + dst_offset) >> PAGE_CACHE_SHIFT;
+        size_t in_page = (page_skew + dst_offset) &
+                ((unsigned long)PAGE_CACHE_SIZE - 1);
+        struct page *dst_page;
+        char *mapped;
+        size_t chunk;
+        WARN_ON(src->len != dst->len);
+        /* after the first page, each chunk starts at offset 0 of its page */
+        for (; len > 0; pg_idx++, in_page = 0) {
+                dst_page = extent_buffer_page(dst, pg_idx);
+                WARN_ON(!PageUptodate(dst_page));
+                chunk = min(len, (unsigned long)(PAGE_CACHE_SIZE - in_page));
+                mapped = kmap_atomic(dst_page, KM_USER0);
+                read_extent_buffer(src, mapped + in_page, src_offset, chunk);
+                kunmap_atomic(mapped, KM_USER0);
+                src_offset += chunk;
+                len -= chunk;
+        }
+}
+/*
+ * Copy @len bytes from @src_page + @src_off to @dst_page + @dst_off.
+ *
+ * When both are the same page the ranges may overlap and memmove() is
+ * used.  For two different pages the bytes are copied one at a time,
+ * beginning with the last byte and ending with the first.
+ */
+static void move_pages(struct page *dst_page, struct page *src_page,
+                       unsigned long dst_off, unsigned long src_off,
+                       unsigned long len)
+{
+        char *to = kmap_atomic(dst_page, KM_USER0);
+        char *from;
+        unsigned long left;
+        if (dst_page != src_page) {
+                from = kmap_atomic(src_page, KM_USER1);
+                /* tail-first byte copy */
+                for (left = len; left > 0; left--)
+                        to[dst_off + left - 1] = from[src_off + left - 1];
+                kunmap_atomic(from, KM_USER1);
+        } else {
+                memmove(to + dst_off, to + src_off, len);
+        }
+        kunmap_atomic(to, KM_USER0);
+}
+/*
+ * Copy @len bytes from @src_page + @src_off to @dst_page + @dst_off.
+ *
+ * Two different pages cannot overlap, so memcpy() is used.  When source
+ * and destination are the same page the two ranges may overlap (the
+ * forward path of memmove_extent_buffer() ends up here), and memcpy() on
+ * overlapping memory is undefined, so that case uses memmove().
+ */
+static void copy_pages(struct page *dst_page, struct page *src_page,
+                       unsigned long dst_off, unsigned long src_off,
+                       unsigned long len)
+{
+        char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
+        char *src_kaddr;
+        if (dst_page != src_page) {
+                src_kaddr = kmap_atomic(src_page, KM_USER1);
+                memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
+                kunmap_atomic(src_kaddr, KM_USER1);
+        } else {
+                /* same page: a single mapping serves both ends of the copy */
+                memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
+        }
+        kunmap_atomic(dst_kaddr, KM_USER0);
+}
+/*
+ * Copy @len bytes within extent buffer @dst, from @src_offset to
+ * @dst_offset, walking both ranges from low offsets to high.
+ *
+ * Each step copies the largest chunk that stays inside one source page
+ * and one destination page.  Either range reaching past dst->len is
+ * treated as fatal (BUG).
+ */
+void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
+                           unsigned long src_offset, unsigned long len)
+{
+        size_t cur;
+        size_t dst_off_in_page;
+        size_t src_off_in_page;
+        size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
+        unsigned long dst_i;
+        unsigned long src_i;
+        /* messages name memcpy so a log line points at the right caller */
+        if (src_offset + len > dst->len) {
+                printk(KERN_ERR "btrfs memcpy bogus src_offset %lu move "
+                       "len %lu dst len %lu\n", src_offset, len, dst->len);
+                BUG_ON(1);
+        }
+        if (dst_offset + len > dst->len) {
+                printk(KERN_ERR "btrfs memcpy bogus dst_offset %lu move "
+                       "len %lu dst len %lu\n", dst_offset, len, dst->len);
+                BUG_ON(1);
+        }
+        while (len > 0) {
+                dst_off_in_page = (start_offset + dst_offset) &
+                        ((unsigned long)PAGE_CACHE_SIZE - 1);
+                src_off_in_page = (start_offset + src_offset) &
+                        ((unsigned long)PAGE_CACHE_SIZE - 1);
+                dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
+                src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
+                /* stay within both the source page and the destination page */
+                cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
+                                               src_off_in_page));
+                cur = min_t(unsigned long, cur,
+                        (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
+                copy_pages(extent_buffer_page(dst, dst_i),
+                           extent_buffer_page(dst, src_i),
+                           dst_off_in_page, src_off_in_page, cur);
+                src_offset += cur;
+                dst_offset += cur;
+                len -= cur;
+        }
+}
+/*
+ * Move @len bytes within extent buffer @dst, from @src_offset to
+ * @dst_offset.
+ *
+ * When the destination lies below the source the work is handed to
+ * memcpy_extent_buffer(), which copies from low offsets to high.
+ * Otherwise the ranges are walked from their last byte towards their
+ * first, one page-bounded chunk at a time.  Either range reaching past
+ * dst->len is treated as fatal (BUG).
+ */
+void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
+                           unsigned long src_offset, unsigned long len)
+{
+        size_t cur;
+        size_t dst_off_in_page;
+        size_t src_off_in_page;
+        /* offsets of the LAST byte of each range (inclusive) */
+        unsigned long dst_end = dst_offset + len - 1;
+        unsigned long src_end = src_offset + len - 1;
+        size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
+        unsigned long dst_i;
+        unsigned long src_i;
+        if (src_offset + len > dst->len) {
+                printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
+                       "len %lu len %lu\n", src_offset, len, dst->len);
+                BUG_ON(1);
+        }
+        if (dst_offset + len > dst->len) {
+                printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
+                       "len %lu len %lu\n", dst_offset, len, dst->len);
+                BUG_ON(1);
+        }
+        /* moving towards lower offsets: a forward copy is used instead */
+        if (dst_offset < src_offset) {
+                memcpy_extent_buffer(dst, dst_offset, src_offset, len);
+                return;
+        }
+        while (len > 0) {
+                /* page index and in-page offset of the current last byte */
+                dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
+                src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
+                dst_off_in_page = (start_offset + dst_end) &
+                        ((unsigned long)PAGE_CACHE_SIZE - 1);
+                src_off_in_page = (start_offset + src_end) &
+                        ((unsigned long)PAGE_CACHE_SIZE - 1);
+                /*
+                 * off_in_page + 1 is the number of bytes from the start of
+                 * the page up to and including the current last byte, so
+                 * the chunk never crosses the start of either page.
+                 */
+                cur = min_t(unsigned long, len, src_off_in_page + 1);
+                cur = min(cur, dst_off_in_page + 1);
+                /* convert the inclusive end offsets back to chunk starts */
+                move_pages(extent_buffer_page(dst, dst_i),
+                           extent_buffer_page(dst, src_i),
+                           dst_off_in_page - cur + 1,
+                           src_off_in_page - cur + 1, cur);
+                dst_end -= cur;
+                src_end -= cur;
+                len -= cur;
+        }
+}
+/*
+ * Try to free the extent buffer that starts at @page's file offset.
+ *
+ * Under tree->buffer_lock the buffer is looked up; if it exists and its
+ * reference count is not above one, a page reference is dropped for each
+ * of its pages, it is erased from tree->buffer and freed.
+ *
+ * Returns 0 if the buffer has more than one reference and was left alone,
+ * 1 otherwise (released, or no buffer found at that offset).
+ */
+int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
+{
+        u64 start = page_offset(page);
+        struct extent_buffer *eb;
+        unsigned long nr_pages;
+        unsigned long idx;
+        int released = 1;
+        spin_lock(&tree->buffer_lock);
+        eb = buffer_search(tree, start);
+        if (eb) {
+                if (atomic_read(&eb->refs) > 1) {
+                        /* somebody else still holds a reference */
+                        released = 0;
+                } else {
+                        /* at this point we can safely release the extent buffer */
+                        nr_pages = num_extent_pages(eb->start, eb->len);
+                        for (idx = 0; idx < nr_pages; idx++)
+                                page_cache_release(extent_buffer_page(eb, idx));
+                        rb_erase(&eb->rb_node, &tree->buffer);
+                        __free_extent_buffer(eb);
+                }
+        }
+        spin_unlock(&tree->buffer_lock);
+        return released;
+}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
new file mode 100644
index 000000000000..c5b483a79137
--- /dev/null
+++ b/fs/btrfs/extent_io.h
@@ -0,0 +1,269 @@
+#ifndef __EXTENTIO__
+#define __EXTENTIO__
+#include <linux/rbtree.h>
+/*
+ * bits for the extent state
+ *
+ * Every EXTENT_* value below except EXTENT_IOBITS is a distinct single bit,
+ * so several can be OR-ed together into one mask.  EXTENT_IOBITS is such a
+ * combined mask: EXTENT_LOCKED | EXTENT_WRITEBACK.
+ */
+#define EXTENT_DIRTY 1
+#define EXTENT_WRITEBACK (1 << 1)
+#define EXTENT_UPTODATE (1 << 2)
+#define EXTENT_LOCKED (1 << 3)
+#define EXTENT_NEW (1 << 4)
+#define EXTENT_DELALLOC (1 << 5)
+#define EXTENT_DEFRAG (1 << 6)
+#define EXTENT_DEFRAG_DONE (1 << 7)
+#define EXTENT_BUFFER_FILLED (1 << 8)
+#define EXTENT_ORDERED (1 << 9)
+#define EXTENT_ORDERED_METADATA (1 << 10)
+#define EXTENT_BOUNDARY (1 << 11)
+#define EXTENT_NODATASUM (1 << 12)
+#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
+/* flags for bio submission */
+#define EXTENT_BIO_COMPRESSED 1
+/*
+ * page->private values.  Every page that is controlled by the extent
+ * map has page->private set to one.
+ */
+#define EXTENT_PAGE_PRIVATE 1
+#define EXTENT_PAGE_PRIVATE_FIRST_PAGE 3
+struct extent_state;
+typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
+                                       struct bio *bio, int mirror_num,
+                                       unsigned long bio_flags);
+/*
+ * Table of callbacks that an extent_io_tree points to through its ops
+ * member.
+ *
+ * NOTE(review): when each hook is invoked, and which hooks may be left
+ * NULL, is decided by the extent_io code that calls them and is not
+ * documented here - check the call sites before relying on either.
+ */
+struct extent_io_ops {
+        int (*fill_delalloc)(struct inode *inode, struct page *locked_page,
+                             u64 start, u64 end, int *page_started,
+                             unsigned long *nr_written);
+        int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
+        int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
+        extent_submit_bio_hook_t *submit_bio_hook;
+        int (*merge_bio_hook)(struct page *page, unsigned long offset,
+                              size_t size, struct bio *bio,
+                              unsigned long bio_flags);
+        int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
+        int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
+                                       u64 start, u64 end,
+                                       struct extent_state *state);
+        int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
+                                        u64 start, u64 end,
+                                       struct extent_state *state);
+        int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end,
+                                    struct extent_state *state);
+        int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
+                                      struct extent_state *state, int uptodate);
+        int (*set_bit_hook)(struct inode *inode, u64 start, u64 end,
+                            unsigned long old, unsigned long bits);
+        int (*clear_bit_hook)(struct inode *inode, u64 start, u64 end,
+                            unsigned long old, unsigned long bits);
+        int (*write_cache_pages_lock_hook)(struct page *page);
+};
+/*
+ * Per-mapping container for extent state and extent buffers.
+ */
+struct extent_io_tree {
+        /* presumably the rb-tree of struct extent_state nodes - TODO confirm */
+        struct rb_root state;
+        /* rb-tree of struct extent_buffer, linked through eb->rb_node */
+        struct rb_root buffer;
+        /* address space whose page cache backs this tree */
+        struct address_space *mapping;
+        u64 dirty_bytes;
+        /* NOTE(review): looks like this guards 'state' - confirm */
+        spinlock_t lock;
+        /* held around lookups in and removals from 'buffer' */
+        spinlock_t buffer_lock;
+        /* callback table, see struct extent_io_ops */
+        struct extent_io_ops *ops;
+};
+/*
+ * One node describing the state of a byte range [start, end].
+ * Nodes are linked into an rb-tree through rb_node; extent_state_next()
+ * steps to the following node in that tree.
+ */
+struct extent_state {
+        u64 start;
+        u64 end; /* inclusive */
+        struct rb_node rb_node;
+        /* back-pointer to an extent_io_tree, presumably the owner - TODO confirm */
+        struct extent_io_tree *tree;
+        wait_queue_head_t wq;
+        atomic_t refs;
+        /* presumably a mask of the EXTENT_* bits - TODO confirm */
+        unsigned long state;
+        /* for use by the FS */
+        u64 private;
+        struct list_head leak_list;
+};
+/*
+ * A run of 'len' bytes starting at byte address 'start', backed by one or
+ * more pages.  'start' need not be page aligned; the accessors add its
+ * low bits to every buffer-relative offset.
+ */
+struct extent_buffer {
+        u64 start;
+        unsigned long len;
+        /*
+         * Cached mapping maintained by map_extent_buffer(): the kmap token,
+         * the mapped address, and the buffer-relative start and length that
+         * the mapping covers.  map_token is NULL when nothing is cached.
+         */
+        char *map_token;
+        char *kaddr;
+        unsigned long map_start;
+        unsigned long map_len;
+        struct page *first_page;
+        atomic_t refs;
+        /* EXTENT_UPTODATE is set here once the whole buffer is known good */
+        int flags;
+        struct list_head leak_list;
+        /* links the buffer into extent_io_tree.buffer */
+        struct rb_node rb_node;
+        /* map_extent_buffer() warns if this is not held when it replaces the cached mapping */
+        struct mutex mutex;
+};
+struct extent_map_tree;
+/*
+ * Return the extent_state that follows @state in rb-tree order, or NULL
+ * when @state is the last node.
+ */
+static inline struct extent_state *extent_state_next(struct extent_state *state)
+{
+        struct rb_node *next = rb_next(&state->rb_node);
+        return next ? rb_entry(next, struct extent_state, rb_node) : NULL;
+}
+typedef struct extent_map *(get_extent_t)(struct inode *inode,
+                                          struct page *page,
+                                          size_t page_offset,
+                                          u64 start, u64 len,
+                                          int create);
+void extent_io_tree_init(struct extent_io_tree *tree,
+                          struct address_space *mapping, gfp_t mask);
+int try_release_extent_mapping(struct extent_map_tree *map,
+                               struct extent_io_tree *tree, struct page *page,
+                               gfp_t mask);
+int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page);
+int try_release_extent_state(struct extent_map_tree *map,
+                             struct extent_io_tree *tree, struct page *page,
+                             gfp_t mask);
+int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
+int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
+int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+                    gfp_t mask);
+int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
+                          get_extent_t *get_extent);
+int __init extent_io_init(void);
+void extent_io_exit(void);
+u64 count_range_bits(struct extent_io_tree *tree,
+                     u64 *start, u64 search_end,
+                     u64 max_bytes, unsigned long bits);
+int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
+                   int bits, int filled);
+int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+                      int bits, gfp_t mask);
+int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+                     int bits, int wake, int delete, gfp_t mask);
+int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+                    int bits, gfp_t mask);
+int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
+                        gfp_t mask);
+int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
+                   gfp_t mask);
+int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
+                     gfp_t mask);
+int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
+                       gfp_t mask);
+int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
+                       gfp_t mask);
+int clear_extent_ordered_metadata(struct extent_io_tree *tree, u64 start,
+                                  u64 end, gfp_t mask);
+int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
+                     gfp_t mask);
+int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
+                     gfp_t mask);
+int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
+                          u64 *start_ret, u64 *end_ret, int bits);
+struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
+                                                 u64 start, int bits);
+int extent_invalidatepage(struct extent_io_tree *tree,
+                          struct page *page, unsigned long offset);
+int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
+                          get_extent_t *get_extent,
+                          struct writeback_control *wbc);
+int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
+                              u64 start, u64 end, get_extent_t *get_extent,
+                              int mode);
+int extent_writepages(struct extent_io_tree *tree,
+                      struct address_space *mapping,
+                      get_extent_t *get_extent,
+                      struct writeback_control *wbc);
+int extent_readpages(struct extent_io_tree *tree,
+                     struct address_space *mapping,
+                     struct list_head *pages, unsigned nr_pages,
+                     get_extent_t get_extent);
+int extent_prepare_write(struct extent_io_tree *tree,
+                         struct inode *inode, struct page *page,
+                         unsigned from, unsigned to, get_extent_t *get_extent);
+int extent_commit_write(struct extent_io_tree *tree,
+                        struct inode *inode, struct page *page,
+                        unsigned from, unsigned to);
+sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
+                get_extent_t *get_extent);
+int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end);
+int set_state_private(struct extent_io_tree *tree, u64 start, u64 private);
+int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
+void set_page_extent_mapped(struct page *page);
+struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
+                                          u64 start, unsigned long len,
+                                          struct page *page0,
+                                          gfp_t mask);
+struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
+                                         u64 start, unsigned long len,
+                                          gfp_t mask);
+void free_extent_buffer(struct extent_buffer *eb);
+int read_extent_buffer_pages(struct extent_io_tree *tree,
+                             struct extent_buffer *eb, u64 start, int wait,
+                             get_extent_t *get_extent, int mirror_num);
+/* Take an additional reference on @eb by incrementing eb->refs. */
+static inline void extent_buffer_get(struct extent_buffer *eb)
+{
+        atomic_inc(&eb->refs);
+}
+int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
+                          unsigned long start,
+                          unsigned long len);
+void read_extent_buffer(struct extent_buffer *eb, void *dst,
+                        unsigned long start,
+                        unsigned long len);
+void write_extent_buffer(struct extent_buffer *eb, const void *src,
+                         unsigned long start, unsigned long len);
+void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
+                        unsigned long dst_offset, unsigned long src_offset,
+                        unsigned long len);
+void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
+                           unsigned long src_offset, unsigned long len);
+void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
+                           unsigned long src_offset, unsigned long len);
+void memset_extent_buffer(struct extent_buffer *eb, char c,
+                          unsigned long start, unsigned long len);
+int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
+                                    struct extent_buffer *eb);
+int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end);
+int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits);
+int clear_extent_buffer_dirty(struct extent_io_tree *tree,
+                              struct extent_buffer *eb);
+int set_extent_buffer_dirty(struct extent_io_tree *tree,
+                             struct extent_buffer *eb);
+int set_extent_buffer_uptodate(struct extent_io_tree *tree,
+                               struct extent_buffer *eb);
+int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
+                                struct extent_buffer *eb);
+int extent_buffer_uptodate(struct extent_io_tree *tree,
+                           struct extent_buffer *eb);
+int map_extent_buffer(struct extent_buffer *eb, unsigned long offset,
+                      unsigned long min_len, char **token, char **map,
+                      unsigned long *map_start,
+                      unsigned long *map_len, int km);
+int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
+                      unsigned long min_len, char **token, char **map,
+                      unsigned long *map_start,
+                      unsigned long *map_len, int km);
+void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km);
+int release_extent_buffer_tail_pages(struct extent_buffer *eb);
+int extent_range_uptodate(struct extent_io_tree *tree,
+                          u64 start, u64 end);
+int extent_clear_unlock_delalloc(struct inode *inode,
+                                struct extent_io_tree *tree,
+                                u64 start, u64 end, struct page *locked_page,
+                                int unlock_page,
+                                int clear_unlock,
+                                int clear_delalloc, int clear_dirty,
+                                int set_writeback,
+                                int end_writeback);
+#endif
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
new file mode 100644
index 000000000000..4a83e33ada32
--- /dev/null
+++ b/fs/btrfs/extent_map.c
@@ -0,0 +1,351 @@
+#include <linux/err.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/version.h>
+#include <linux/hardirq.h>
+#include "extent_map.h"
+/* temporary define until extent_map moves out of btrfs */
+struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
+                                       unsigned long extra_flags,
+                                       void (*ctor)(void *, struct kmem_cache *,
+                                                    unsigned long));
+static struct kmem_cache *extent_map_cache;
+/*
+ * Create the slab cache that backs every struct extent_map allocation.
+ * Returns 0 on success or -ENOMEM when the cache could not be created.
+ */
+int __init extent_map_init(void)
+{
+        struct kmem_cache *cache;
+        cache = btrfs_cache_create("extent_map", sizeof(struct extent_map),
+                                   0, NULL);
+        extent_map_cache = cache;
+        return cache ? 0 : -ENOMEM;
+}
+/* Destroy the extent_map slab cache; a no-op when the cache pointer is NULL. */
+void extent_map_exit(void)
+{
+        if (!extent_map_cache)
+                return;
+        kmem_cache_destroy(extent_map_cache);
+}
+/**
+ * extent_map_tree_init - initialize extent map tree
+ * @tree:               tree to initialize
+ * @mask:               flags for memory allocations during tree operations
+ *                      (not referenced by this function)
+ *
+ * Initialize the extent tree @tree: the rb-tree root is emptied and the
+ * tree spinlock is initialized.  Should be called for each new inode
+ * or other user of the extent_map interface.
+ */
+void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)
+{
+        tree->map.rb_node = NULL;
+        spin_lock_init(&tree->lock);
+}
+EXPORT_SYMBOL(extent_map_tree_init);
+/**
+ * alloc_extent_map - allocate new extent map structure
+ * @mask:       memory allocation flags
+ *
+ * Take a fresh extent_map from the slab cache.  On success the structure
+ * comes back with in_tree and flags cleared and a reference count of
+ * one; release it with free_extent_map().  A NULL or error-pointer result
+ * from the allocator is handed back to the caller untouched.
+ */
+struct extent_map *alloc_extent_map(gfp_t mask)
+{
+        struct extent_map *map = kmem_cache_alloc(extent_map_cache, mask);
+        if (map && !IS_ERR(map)) {
+                map->in_tree = 0;
+                map->flags = 0;
+                atomic_set(&map->refs, 1);
+        }
+        return map;
+}
+EXPORT_SYMBOL(alloc_extent_map);
+/**
+ * free_extent_map - drop reference count of an extent_map
+ * @em:         extent map being released (NULL is accepted and ignored)
+ *
+ * Drops the reference count on @em by one and frees the structure
+ * if the reference count hits zero.  Warns if the count was already zero
+ * or if the map is still flagged as in_tree when it is about to be freed.
+ */
+void free_extent_map(struct extent_map *em)
+{
+        if (!em)
+                return;
+        WARN_ON(atomic_read(&em->refs) == 0);
+        if (atomic_dec_and_test(&em->refs)) {
+                WARN_ON(em->in_tree);
+                kmem_cache_free(extent_map_cache, em);
+        }
+}
+EXPORT_SYMBOL(free_extent_map);
+/*
+ * Link @node into @root, keyed by @offset.  If @offset lands inside an
+ * extent_map that is already in the tree, nothing is inserted and that
+ * entry's rb_node is returned.  Otherwise the new entry is flagged
+ * in_tree, linked and rebalanced, and NULL is returned.
+ */
+static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
+                                   struct rb_node *node)
+{
+        struct rb_node **link = &root->rb_node;
+        struct rb_node *parent = NULL;
+        struct extent_map *cur;
+        while (*link) {
+                parent = *link;
+                cur = rb_entry(parent, struct extent_map, rb_node);
+                WARN_ON(!cur->in_tree);
+                if (offset < cur->start) {
+                        link = &parent->rb_left;
+                        continue;
+                }
+                /* offset >= cur->start: inside this entry means a collision */
+                if (offset < extent_map_end(cur))
+                        return parent;
+                link = &parent->rb_right;
+        }
+        cur = rb_entry(node, struct extent_map, rb_node);
+        cur->in_tree = 1;
+        rb_link_node(node, parent, link);
+        rb_insert_color(node, root);
+        return NULL;
+}
+/*
+ * search through the tree for an extent_map with a given offset.  If
+ * it can't be found, try to find some neighboring extents
+ *
+ * Returns the rb_node of the extent_map that contains @offset, or NULL
+ * when no entry contains it.  On a miss, and only for the output pointers
+ * the caller passed as non-NULL:
+ *   *prev_ret - walking forward (rb_next) from the last node visited, the
+ *               first entry whose end lies beyond @offset, or NULL
+ *   *next_ret - walking backward (rb_prev) from the last node visited, the
+ *               first entry whose start is at or before @offset, or NULL
+ * Note that despite the names, *prev_ret ends up at or after @offset and
+ * *next_ret at or before it.  Neither output is written on a hit.
+ */
+static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
+                                     struct rb_node **prev_ret,
+                                     struct rb_node **next_ret)
+{
+        struct rb_node *n = root->rb_node;
+        struct rb_node *prev = NULL;
+        struct rb_node *orig_prev = NULL;
+        struct extent_map *entry;
+        struct extent_map *prev_entry = NULL;
+        /* plain binary search; prev tracks the last node looked at */
+        while (n) {
+                entry = rb_entry(n, struct extent_map, rb_node);
+                prev = n;
+                prev_entry = entry;
+                WARN_ON(!entry->in_tree);
+                if (offset < entry->start)
+                        n = n->rb_left;
+                else if (offset >= extent_map_end(entry))
+                        n = n->rb_right;
+                else
+                        return n;
+        }
+        if (prev_ret) {
+                /* remember the starting point so the next_ret walk can reuse it */
+                orig_prev = prev;
+                while (prev && offset >= extent_map_end(prev_entry)) {
+                        prev = rb_next(prev);
+                        prev_entry = rb_entry(prev, struct extent_map, rb_node);
+                }
+                *prev_ret = prev;
+                prev = orig_prev;
+        }
+        if (next_ret) {
+                /* prev may be NULL here; prev_entry is only read after the prev test */
+                prev_entry = rb_entry(prev, struct extent_map, rb_node);
+                while (prev && offset < prev_entry->start) {
+                        prev = rb_prev(prev);
+                        prev_entry = rb_entry(prev, struct extent_map, rb_node);
+                }
+                *next_ret = prev;
+        }
+        return NULL;
+}
+/*
+ * Find the node whose extent contains 'offset'.  When there is no such
+ * node, fall back to the neighbour that __tree_search() reports through
+ * its prev_ret argument (which may itself be NULL).
+ */
+static inline struct rb_node *tree_search(struct rb_root *root, u64 offset)
+{
+        struct rb_node *fallback;
+        struct rb_node *hit = __tree_search(root, offset, &fallback, NULL);
+        return hit ? hit : fallback;
+}
+/*
+ * Decide whether two extent_map structs are adjacent and safe to merge.
+ * Returns 1 when @prev and @next can be collapsed into one, 0 otherwise.
+ */
+static int mergable_maps(struct extent_map *prev, struct extent_map *next)
+{
+        /*
+         * pinned extents are never merged, and neither are compressed
+         * ones because their actual size must stay known
+         */
+        if (test_bit(EXTENT_FLAG_PINNED, &prev->flags) ||
+            test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags))
+                return 0;
+        /* must touch in the file and agree on flags and device */
+        if (extent_map_end(prev) != next->start)
+                return 0;
+        if (prev->flags != next->flags)
+                return 0;
+        if (prev->bdev != next->bdev)
+                return 0;
+        /* two holes, two inline extents or two delalloc extents */
+        if (next->block_start == EXTENT_MAP_HOLE &&
+            prev->block_start == EXTENT_MAP_HOLE)
+                return 1;
+        if (next->block_start == EXTENT_MAP_INLINE &&
+            prev->block_start == EXTENT_MAP_INLINE)
+                return 1;
+        if (next->block_start == EXTENT_MAP_DELALLOC &&
+            prev->block_start == EXTENT_MAP_DELALLOC)
+                return 1;
+        /* real extents: next must start exactly where prev's blocks end */
+        if (next->block_start < EXTENT_MAP_LAST_BYTE - 1 &&
+            next->block_start == extent_map_block_end(prev))
+                return 1;
+        return 0;
+}
+/**
+ * add_extent_mapping - add new extent map to the extent tree
+ * @tree:       tree to insert new map in
+ * @em:         map to insert
+ *
+ * Insert @em into @tree and then try a simple backward and forward merge
+ * with the neighbouring mappings.  On success @em itself is linked into
+ * the tree and an additional reference is taken on it; every neighbour
+ * that gets merged into @em is erased from the tree and has one
+ * reference dropped.
+ *
+ * The caller must hold tree->lock.
+ *
+ * Returns 0 on success, or -EEXIST when the range of @em intersects a
+ * mapping that is already in the tree (nothing is inserted in that case).
+ */
+int add_extent_mapping(struct extent_map_tree *tree,
+                       struct extent_map *em)
+{
+        int ret = 0;
+        struct extent_map *merge = NULL;
+        struct rb_node *rb;
+        struct extent_map *exist;
+        exist = lookup_extent_mapping(tree, em->start, em->len);
+        if (exist) {
+                /* drop the reference the lookup took for us */
+                free_extent_map(exist);
+                ret = -EEXIST;
+                goto out;
+        }
+        assert_spin_locked(&tree->lock);
+        rb = tree_insert(&tree->map, em->start, &em->rb_node);
+        if (rb) {
+                ret = -EEXIST;
+                goto out;
+        }
+        /* the tree now holds its own reference on em */
+        atomic_inc(&em->refs);
+        /* backward merge: absorb the entry that ends where em starts */
+        if (em->start != 0) {
+                rb = rb_prev(&em->rb_node);
+                if (rb)
+                        merge = rb_entry(rb, struct extent_map, rb_node);
+                if (rb && mergable_maps(merge, em)) {
+                        em->start = merge->start;
+                        em->len += merge->len;
+                        em->block_len += merge->block_len;
+                        em->block_start = merge->block_start;
+                        merge->in_tree = 0;
+                        rb_erase(&merge->rb_node, &tree->map);
+                        free_extent_map(merge);
+                }
+        }
+        /* forward merge: absorb the entry that starts where em ends */
+        rb = rb_next(&em->rb_node);
+        if (rb)
+                merge = rb_entry(rb, struct extent_map, rb_node);
+        if (rb && mergable_maps(em, merge)) {
+                em->len += merge->len;
+                /*
+                 * grow the block length by the neighbour's block length
+                 * (not its file length), mirroring the backward merge
+                 */
+                em->block_len += merge->block_len;
+                rb_erase(&merge->rb_node, &tree->map);
+                merge->in_tree = 0;
+                free_extent_map(merge);
+        }
+out:
+        return ret;
+}
+EXPORT_SYMBOL(add_extent_mapping);
+/*
+ * end offset of the range that begins at 'start' and spans 'len' bytes,
+ * pinned to (u64)-1 when start + len wraps around
+ */
+static u64 range_end(u64 start, u64 len)
+{
+        u64 end = start + len;
+        return end < start ? (u64)-1 : end;
+}
+/**
+ * lookup_extent_mapping - lookup extent_map
+ * @tree:       tree to lookup in
+ * @start:      byte offset to start the search
+ * @len:        length of the lookup range
+ *
+ * Find and return the first extent_map struct in @tree that intersects the
+ * range starting at @start and spanning @len bytes.  There may be additional
+ * objects in the tree that intersect, so check the object returned carefully
+ * to make sure that no additional lookups are needed.
+ *
+ * The caller must hold tree->lock.  A returned map has had its reference
+ * count raised by one; NULL is returned when nothing intersects.
+ */
+struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
+                                         u64 start, u64 len)
+{
+        struct extent_map *em = NULL;
+        struct extent_map *near;
+        struct rb_node *hit;
+        struct rb_node *prev = NULL;
+        struct rb_node *next = NULL;
+        u64 end = range_end(start, len);
+        assert_spin_locked(&tree->lock);
+        hit = __tree_search(&tree->map, start, &prev, &next);
+        if (hit) {
+                if (IS_ERR(hit))
+                        return ERR_PTR(PTR_ERR(hit));
+                em = rb_entry(hit, struct extent_map, rb_node);
+                if (!(end > em->start && start < extent_map_end(em)))
+                        return NULL;
+        } else {
+                /* nothing contains start: test the two neighbours in turn */
+                if (prev) {
+                        near = rb_entry(prev, struct extent_map, rb_node);
+                        if (end > near->start && start < extent_map_end(near))
+                                em = near;
+                }
+                if (!em && next) {
+                        near = rb_entry(next, struct extent_map, rb_node);
+                        if (end > near->start && start < extent_map_end(near))
+                                em = near;
+                }
+                if (!em)
+                        return NULL;
+        }
+        atomic_inc(&em->refs);
+        return em;
+}
+EXPORT_SYMBOL(lookup_extent_mapping);
+/**
+ * remove_extent_mapping - removes an extent_map from the extent tree
+ * @tree:       extent tree to remove from
+ * @em:         extent map being removed
+ *
+ * Erases @em from @tree and clears its in_tree marker.  No reference
+ * counts are dropped, and no checks are done to see if the range is in
+ * use; a warning is issued if @em is still pinned.  The caller must hold
+ * tree->lock.  Always returns 0.
+ */
+int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
+{
+        WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
+        assert_spin_locked(&tree->lock);
+        rb_erase(&em->rb_node, &tree->map);
+        em->in_tree = 0;
+        return 0;
+}
+EXPORT_SYMBOL(remove_extent_mapping);
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
new file mode 100644
index 000000000000..fb6eeef06bb0
--- /dev/null
+++ b/fs/btrfs/extent_map.h
@@ -0,0 +1,62 @@
+#ifndef __EXTENTMAP__
+#define __EXTENTMAP__
+#include <linux/rbtree.h>
+/*
+ * special values compared against extent_map->block_start; they sit at the
+ * very top of the u64 range.  Each expansion is fully parenthesized so the
+ * cast cannot bind to a neighbouring operator at the point of use.
+ */
+#define EXTENT_MAP_LAST_BYTE ((u64)-4)
+#define EXTENT_MAP_HOLE ((u64)-3)
+#define EXTENT_MAP_INLINE ((u64)-2)
+#define EXTENT_MAP_DELALLOC ((u64)-1)
+/* bits for the flags field */
+#define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */
+#define EXTENT_FLAG_COMPRESSED 1
+#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */
+#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */
+/* one mapping from a byte range [start, start + len) to a block range */
+struct extent_map {
+        /* linkage into extent_map_tree->map */
+        struct rb_node rb_node;
+        /* all of these are in bytes */
+        u64 start;
+        u64 len;
+        /* NOTE(review): not referenced by extent_map.c; meaning to be confirmed */
+        u64 orig_start;
+        /* block offset, or one of the EXTENT_MAP_* special values */
+        u64 block_start;
+        u64 block_len;
+        /* bitmap indexed by the EXTENT_FLAG_* bit numbers */
+        unsigned long flags;
+        /* two maps are only merged when their bdev pointers match */
+        struct block_device *bdev;
+        /* reference count; the map is freed when it drops to zero */
+        atomic_t refs;
+        /* 1 while linked into a tree, 0 otherwise */
+        int in_tree;
+};
+/* a collection of extent_maps, indexed by their byte range */
+struct extent_map_tree {
+        /* rb-tree root holding the extent_map entries */
+        struct rb_root map;
+        /* must be held around lookup, add and remove (they assert it) */
+        spinlock_t lock;
+};
+/* first byte past @em in the file, pinned to (u64)-1 if start + len wraps */
+static inline u64 extent_map_end(struct extent_map *em)
+{
+        u64 end = em->start + em->len;
+        return end < em->start ? (u64)-1 : end;
+}
+/*
+ * first byte past the block range of @em, pinned to (u64)-1 if
+ * block_start + block_len wraps
+ */
+static inline u64 extent_map_block_end(struct extent_map *em)
+{
+        u64 end = em->block_start + em->block_len;
+        return end < em->block_start ? (u64)-1 : end;
+}
+void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask);
+struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
+                                         u64 start, u64 len);
+int add_extent_mapping(struct extent_map_tree *tree,
+                       struct extent_map *em);
+int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
+struct extent_map *alloc_extent_map(gfp_t mask);
+void free_extent_map(struct extent_map *em);
+int __init extent_map_init(void);
+void extent_map_exit(void);
+#endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
new file mode 100644
index 000000000000..964652435fd1
--- /dev/null
+++ b/fs/btrfs/file-item.c
@@ -0,0 +1,831 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#include <linux/bio.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "print-tree.h"
+/*
+ * MAX_CSUM_ITEMS: leaf data size of root r, less room for two item headers,
+ * divided by the per-checksum size, minus one.  The size argument is
+ * parenthesized so that an expression argument divides as a whole.
+ */
+#define MAX_CSUM_ITEMS(r, size) ((((BTRFS_LEAF_DATA_SIZE(r) - \
+                                   sizeof(struct btrfs_item) * 2) / \
+                                  (size)) - 1))
+/*
+ * MAX_ORDERED_SUM_BYTES: number of data bytes whose per-sector sums fit in
+ * one page together with a struct btrfs_ordered_sum header, less one sector.
+ */
+#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \
+                                   sizeof(struct btrfs_ordered_sum)) / \
+                                   sizeof(struct btrfs_sector_sum) * \
+                                   (r)->sectorsize - (r)->sectorsize)
+/*
+ * Insert a regular (BTRFS_FILE_EXTENT_REG) file extent item keyed by
+ * (objectid, BTRFS_EXTENT_DATA_KEY, pos) into root and fill in every field
+ * from the arguments; the generation is taken from trans->transid.
+ *
+ * Returns 0 on success, -ENOMEM if no path could be allocated, or the
+ * negative error from btrfs_insert_empty_item().
+ */
+int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
+                             struct btrfs_root *root,
+                             u64 objectid, u64 pos,
+                             u64 disk_offset, u64 disk_num_bytes,
+                             u64 num_bytes, u64 offset, u64 ram_bytes,
+                             u8 compression, u8 encryption, u16 other_encoding)
+{
+        int ret = 0;
+        struct btrfs_file_extent_item *item;
+        struct btrfs_key file_key;
+        struct btrfs_path *path;
+        struct extent_buffer *leaf;
+        path = btrfs_alloc_path();
+        /* report allocation failure to the caller instead of crashing */
+        if (!path)
+                return -ENOMEM;
+        file_key.objectid = objectid;
+        file_key.offset = pos;
+        btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
+        ret = btrfs_insert_empty_item(trans, root, path, &file_key,
+                                      sizeof(*item));
+        if (ret < 0)
+                goto out;
+        /* a positive return is not expected from an insert */
+        BUG_ON(ret);
+        leaf = path->nodes[0];
+        item = btrfs_item_ptr(leaf, path->slots[0],
+                              struct btrfs_file_extent_item);
+        btrfs_set_file_extent_disk_bytenr(leaf, item, disk_offset);
+        btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes);
+        btrfs_set_file_extent_offset(leaf, item, offset);
+        btrfs_set_file_extent_num_bytes(leaf, item, num_bytes);
+        btrfs_set_file_extent_ram_bytes(leaf, item, ram_bytes);
+        btrfs_set_file_extent_generation(leaf, item, trans->transid);
+        btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
+        btrfs_set_file_extent_compression(leaf, item, compression);
+        btrfs_set_file_extent_encryption(leaf, item, encryption);
+        btrfs_set_file_extent_other_encoding(leaf, item, other_encoding);
+        btrfs_mark_buffer_dirty(leaf);
+out:
+        btrfs_free_path(path);
+        return ret;
+}
+/*
+ * Locate the checksum for the block at bytenr in root.
+ *
+ * Searches for the key (BTRFS_EXTENT_CSUM_OBJECTID, BTRFS_EXTENT_CSUM_KEY,
+ * bytenr).  Without an exact match the previous item in the leaf is
+ * examined, since one csum item holds the sums of several consecutive
+ * blocks.  On success path is left pointing at the item and the return
+ * value addresses the individual checksum inside it; this function does
+ * not release path.
+ *
+ * Returns ERR_PTR(-ENOENT) when no csum item covers bytenr,
+ * ERR_PTR(-EFBIG) when bytenr lies past the end of the preceding csum
+ * item, or ERR_PTR() of the btrfs_search_slot() error.
+ */
+struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
+                                          struct btrfs_root *root,
+                                          struct btrfs_path *path,
+                                          u64 bytenr, int cow)
+{
+        int ret;
+        struct btrfs_key file_key;
+        struct btrfs_key found_key;
+        struct btrfs_csum_item *item;
+        struct extent_buffer *leaf;
+        u64 csum_offset = 0;
+        u16 csum_size =
+                btrfs_super_csum_size(&root->fs_info->super_copy);
+        int csums_in_item;
+        file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+        file_key.offset = bytenr;
+        btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY);
+        ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow);
+        if (ret < 0)
+                goto fail;
+        leaf = path->nodes[0];
+        if (ret > 0) {
+                /* no exact key: the sum may live inside the previous item */
+                ret = 1;
+                if (path->slots[0] == 0)
+                        goto fail;
+                path->slots[0]--;
+                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+                /* only the key type is checked here, not the objectid */
+                if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY)
+                        goto fail;
+                /* index of bytenr's block within the item */
+                csum_offset = (bytenr - found_key.offset) >>
+                                root->fs_info->sb->s_blocksize_bits;
+                csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]);
+                csums_in_item /= csum_size;
+                if (csum_offset >= csums_in_item) {
+                        ret = -EFBIG;
+                        goto fail;
+                }
+        }
+        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
+        /* step to the csum_offset'th checksum of the item */
+        item = (struct btrfs_csum_item *)((unsigned char *)item +
+                                          csum_offset * csum_size);
+        return item;
+fail:
+        /* a leftover positive value means "not found" */
+        if (ret > 0)
+                ret = -ENOENT;
+        return ERR_PTR(ret);
+}
+/*
+ * Search root for the file extent item keyed by
+ * (objectid, BTRFS_EXTENT_DATA_KEY, offset).  A negative mod searches with
+ * an insert length of -1 and any non-zero mod requests copy-on-write.
+ * The btrfs_search_slot() result is returned unchanged.
+ */
+int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
+                             struct btrfs_root *root,
+                             struct btrfs_path *path, u64 objectid,
+                             u64 offset, int mod)
+{
+        struct btrfs_key file_key;
+        int ins_len = 0;
+        int cow = 0;
+        if (mod < 0)
+                ins_len = -1;
+        if (mod != 0)
+                cow = 1;
+        file_key.objectid = objectid;
+        file_key.offset = offset;
+        btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
+        return btrfs_search_slot(trans, root, &file_key, path, ins_len, cow);
+}
+/*
+ * Look up the checksum for every bio_vec of bio.
+ *
+ * For each vector the sum is taken from a pending ordered extent when
+ * btrfs_find_ordered_sum() has one, and otherwise read from the csum tree;
+ * the most recently found csum item is kept so that consecutive blocks do
+ * not need a new tree search.  A block with no checksum gets a sum of 0
+ * (and is either flagged EXTENT_NODATASUM for data relocation inodes or
+ * reported with printk).
+ *
+ * Each sum is stored through dst when dst is non-NULL, otherwise it is
+ * recorded against the file offset with set_state_private().
+ *
+ * Returns 0, or -ENOMEM if no path could be allocated.
+ */
+int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
+                          struct bio *bio, u32 *dst)
+{
+        u32 sum;
+        struct bio_vec *bvec = bio->bi_io_vec;
+        int bio_index = 0;
+        u64 offset;
+        u64 item_start_offset = 0;
+        u64 item_last_offset = 0;
+        u64 disk_bytenr;
+        u32 diff;
+        u16 csum_size =
+                btrfs_super_csum_size(&root->fs_info->super_copy);
+        int ret;
+        struct btrfs_path *path;
+        struct btrfs_csum_item *item = NULL;
+        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+        path = btrfs_alloc_path();
+        /* path is dereferenced just below, so allocation failure must stop here */
+        if (!path)
+                return -ENOMEM;
+        if (bio->bi_size > PAGE_CACHE_SIZE * 8)
+                path->reada = 2;
+        WARN_ON(bio->bi_vcnt <= 0);
+        disk_bytenr = (u64)bio->bi_sector << 9;
+        while (bio_index < bio->bi_vcnt) {
+                offset = page_offset(bvec->bv_page) + bvec->bv_offset;
+                ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum);
+                if (ret == 0)
+                        goto found;
+                /* search the tree only when the cached item does not cover us */
+                if (!item || disk_bytenr < item_start_offset ||
+                    disk_bytenr >= item_last_offset) {
+                        struct btrfs_key found_key;
+                        u32 item_size;
+                        if (item)
+                                btrfs_release_path(root, path);
+                        item = btrfs_lookup_csum(NULL, root->fs_info->csum_root,
+                                                 path, disk_bytenr, 0);
+                        if (IS_ERR(item)) {
+                                ret = PTR_ERR(item);
+                                if (ret == -ENOENT || ret == -EFBIG)
+                                        ret = 0;
+                                sum = 0;
+                                if (BTRFS_I(inode)->root->root_key.objectid ==
+                                    BTRFS_DATA_RELOC_TREE_OBJECTID) {
+                                        set_extent_bits(io_tree, offset,
+                                                offset + bvec->bv_len - 1,
+                                                EXTENT_NODATASUM, GFP_NOFS);
+                                } else {
+                                        printk(KERN_INFO "btrfs no csum found "
+                                               "for inode %lu start %llu\n",
+                                               inode->i_ino,
+                                               (unsigned long long)offset);
+                                }
+                                item = NULL;
+                                btrfs_release_path(root, path);
+                                goto found;
+                        }
+                        btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+                                              path->slots[0]);
+                        /* remember the disk byte range this item covers */
+                        item_start_offset = found_key.offset;
+                        item_size = btrfs_item_size_nr(path->nodes[0],
+                                                       path->slots[0]);
+                        item_last_offset = item_start_offset +
+                                (item_size / csum_size) *
+                                root->sectorsize;
+                        item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+                                              struct btrfs_csum_item);
+                }
+                /*
+                 * this byte range must be able to fit inside
+                 * a single leaf so it will also fit inside a u32
+                 */
+                diff = disk_bytenr - item_start_offset;
+                diff = diff / root->sectorsize;
+                diff = diff * csum_size;
+                read_extent_buffer(path->nodes[0], &sum,
+                                   ((unsigned long)item) + diff,
+                                   csum_size);
+found:
+                if (dst)
+                        *dst++ = sum;
+                else
+                        set_state_private(io_tree, offset, sum);
+                disk_bytenr += bvec->bv_len;
+                bio_index++;
+                bvec++;
+        }
+        btrfs_free_path(path);
+        return 0;
+}
+/*
+ * Collect the checksums stored in root for the disk byte range
+ * [start, end] (end inclusive).
+ *
+ * Every csum item that intersects the range is walked and its sums are
+ * copied into freshly allocated struct btrfs_ordered_sum chunks, each
+ * covering at most MAX_ORDERED_SUM_BYTES(root) bytes, which are appended
+ * to list.  Parts of the range without a csum item are skipped.
+ *
+ * Returns 0 on success, -ENOMEM if no path could be allocated, or the
+ * negative error from the tree search.  Entries already appended to list
+ * are not removed on error.
+ */
+int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
+                             struct list_head *list)
+{
+        struct btrfs_key key;
+        struct btrfs_path *path;
+        struct extent_buffer *leaf;
+        struct btrfs_ordered_sum *sums;
+        struct btrfs_sector_sum *sector_sum;
+        struct btrfs_csum_item *item;
+        unsigned long offset;
+        int ret;
+        size_t size;
+        u64 csum_end;
+        u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
+        path = btrfs_alloc_path();
+        /* report allocation failure to the caller instead of crashing */
+        if (!path)
+                return -ENOMEM;
+        key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+        key.offset = start;
+        key.type = BTRFS_EXTENT_CSUM_KEY;
+        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+        if (ret < 0)
+                goto fail;
+        /* no exact key: the previous item may still cover start */
+        if (ret > 0 && path->slots[0] > 0) {
+                leaf = path->nodes[0];
+                btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
+                if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID &&
+                    key.type == BTRFS_EXTENT_CSUM_KEY) {
+                        offset = (start - key.offset) >>
+                                 root->fs_info->sb->s_blocksize_bits;
+                        if (offset * csum_size <
+                            btrfs_item_size_nr(leaf, path->slots[0] - 1))
+                                path->slots[0]--;
+                }
+        }
+        while (start <= end) {
+                leaf = path->nodes[0];
+                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+                        ret = btrfs_next_leaf(root, path);
+                        if (ret < 0)
+                                goto fail;
+                        if (ret > 0)
+                                break;
+                        leaf = path->nodes[0];
+                }
+                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+                if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
+                    key.type != BTRFS_EXTENT_CSUM_KEY)
+                        break;
+                if (key.offset > end)
+                        break;
+                /* skip any gap between the wanted start and this item */
+                if (key.offset > start)
+                        start = key.offset;
+                size = btrfs_item_size_nr(leaf, path->slots[0]);
+                csum_end = key.offset + (size / csum_size) * root->sectorsize;
+                if (csum_end <= start) {
+                        path->slots[0]++;
+                        continue;
+                }
+                csum_end = min(csum_end, end + 1);
+                item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+                                      struct btrfs_csum_item);
+                while (start < csum_end) {
+                        size = min_t(size_t, csum_end - start,
+                                        MAX_ORDERED_SUM_BYTES(root));
+                        sums = kzalloc(btrfs_ordered_sum_size(root, size),
+                                        GFP_NOFS);
+                        BUG_ON(!sums);
+                        sector_sum = sums->sums;
+                        sums->bytenr = start;
+                        sums->len = size;
+                        /* byte offset of start's checksum inside the item */
+                        offset = (start - key.offset) >>
+                                root->fs_info->sb->s_blocksize_bits;
+                        offset *= csum_size;
+                        /*
+                         * NOTE(review): size is unsigned, so this loop relies
+                         * on size being a multiple of the sector size
+                         */
+                        while (size > 0) {
+                                read_extent_buffer(path->nodes[0],
+                                                &sector_sum->sum,
+                                                ((unsigned long)item) +
+                                                offset, csum_size);
+                                sector_sum->bytenr = start;
+                                size -= root->sectorsize;
+                                start += root->sectorsize;
+                                offset += csum_size;
+                                sector_sum++;
+                        }
+                        list_add_tail(&sums->list, list);
+                }
+                path->slots[0]++;
+        }
+        ret = 0;
+fail:
+        btrfs_free_path(path);
+        return ret;
+}
+/*
+ * Checksum the data of every bio_vec in bio and attach the results to the
+ * ordered extent(s) covering it.
+ *
+ * With contig set the file offset starts at file_start and advances by
+ * each vector's length; otherwise it is recomputed from the page of every
+ * vector.  In the non-contiguous case a vector that falls outside the
+ * current ordered extent closes the current btrfs_ordered_sum and opens a
+ * new one against the ordered extent found at the new offset.
+ *
+ * Returns 0, or -ENOMEM if the first sums allocation fails.  A missing
+ * ordered extent or a failed later allocation hits BUG_ON().
+ */
+int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
+                       struct bio *bio, u64 file_start, int contig)
+{
+        struct btrfs_ordered_sum *sums;
+        struct btrfs_sector_sum *sector_sum;
+        struct btrfs_ordered_extent *ordered;
+        char *data;
+        struct bio_vec *bvec = bio->bi_io_vec;
+        int bio_index = 0;
+        unsigned long total_bytes = 0;
+        unsigned long this_sum_bytes = 0;
+        u64 offset;
+        u64 disk_bytenr;
+        WARN_ON(bio->bi_vcnt <= 0);
+        sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS);
+        if (!sums)
+                return -ENOMEM;
+        sector_sum = sums->sums;
+        disk_bytenr = (u64)bio->bi_sector << 9;
+        sums->len = bio->bi_size;
+        INIT_LIST_HEAD(&sums->list);
+        if (contig)
+                offset = file_start;
+        else
+                offset = page_offset(bvec->bv_page) + bvec->bv_offset;
+        ordered = btrfs_lookup_ordered_extent(inode, offset);
+        BUG_ON(!ordered);
+        sums->bytenr = ordered->start;
+        while (bio_index < bio->bi_vcnt) {
+                if (!contig)
+                        offset = page_offset(bvec->bv_page) + bvec->bv_offset;
+                if (!contig && (offset >= ordered->file_offset + ordered->len ||
+                    offset < ordered->file_offset)) {
+                        unsigned long bytes_left;
+                        /* hand over what was summed so far for the old extent */
+                        sums->len = this_sum_bytes;
+                        this_sum_bytes = 0;
+                        btrfs_add_ordered_sum(inode, ordered, sums);
+                        btrfs_put_ordered_extent(ordered);
+                        /* start a new sums buffer sized for the rest of the bio */
+                        bytes_left = bio->bi_size - total_bytes;
+                        sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left),
+                                       GFP_NOFS);
+                        BUG_ON(!sums);
+                        sector_sum = sums->sums;
+                        sums->len = bytes_left;
+                        ordered = btrfs_lookup_ordered_extent(inode, offset);
+                        BUG_ON(!ordered);
+                        sums->bytenr = ordered->start;
+                }
+                data = kmap_atomic(bvec->bv_page, KM_USER0);
+                /* seed with all ones, then checksum this vector's data */
+                sector_sum->sum = ~(u32)0;
+                sector_sum->sum = btrfs_csum_data(root,
+                                                  data + bvec->bv_offset,
+                                                  sector_sum->sum,
+                                                  bvec->bv_len);
+                kunmap_atomic(data, KM_USER0);
+                btrfs_csum_final(sector_sum->sum,
+                                 (char *)&sector_sum->sum);
+                sector_sum->bytenr = disk_bytenr;
+                /* one sector_sum entry is consumed per bio_vec */
+                sector_sum++;
+                bio_index++;
+                total_bytes += bvec->bv_len;
+                this_sum_bytes += bvec->bv_len;
+                disk_bytenr += bvec->bv_len;
+                offset += bvec->bv_len;
+                bvec++;
+        }
+        this_sum_bytes = 0;
+        btrfs_add_ordered_sum(inode, ordered, sums);
+        btrfs_put_ordered_extent(ordered);
+        return 0;
+}
+/*
+ * Helper for csum removal: trims one csum item that straddles an edge of
+ * the byte range [bytenr, bytenr + len).
+ *
+ * 'key' must be the key of the item 'path' points at.  Two layouts are
+ * handled: the item starts before the range and ends inside it (the tail
+ * of the item is cut off), or the item starts inside the range and ends
+ * after it (the head of the item is cut off and the key is moved up to the
+ * end of the range).  An item matching neither test hits BUG().
+ *
+ * The resizing is done with btrfs_truncate_item.  Always returns 0.
+ */
+static noinline int truncate_one_csum(struct btrfs_trans_handle *trans,
+                                      struct btrfs_root *root,
+                                      struct btrfs_path *path,
+                                      struct btrfs_key *key,
+                                      u64 bytenr, u64 len)
+{
+        struct extent_buffer *eb = path->nodes[0];
+        u16 csum_size =
+                btrfs_super_csum_size(&root->fs_info->super_copy);
+        u32 bits = root->fs_info->sb->s_blocksize_bits;
+        u64 range_end = bytenr + len;
+        u64 item_end;
+        u32 keep;
+        int ret;
+        /* first byte past the blocks this item carries csums for */
+        item_end = btrfs_item_size_nr(eb, path->slots[0]) / csum_size;
+        item_end <<= bits;
+        item_end += key->offset;
+        if (key->offset < bytenr && item_end <= range_end) {
+                /*
+                 *         [ bytenr - len ]
+                 *         [   ]
+                 *   [csum     ]
+                 *   keep only the csums in front of bytenr
+                 */
+                keep = (bytenr - key->offset) >> bits;
+                keep *= csum_size;
+                ret = btrfs_truncate_item(trans, root, path, keep, 1);
+                BUG_ON(ret);
+                return 0;
+        }
+        if (key->offset < bytenr || item_end <= range_end ||
+            range_end <= key->offset) {
+                /* not one of the two layouts this helper understands */
+                BUG();
+                return 0;
+        }
+        /*
+         *         [ bytenr - len ]
+         *                 [ ]
+         *                 [csum     ]
+         * keep only the csums behind the end of the range, then rekey
+         * the item so it starts there
+         */
+        keep = (item_end - range_end) >> bits;
+        keep *= csum_size;
+        ret = btrfs_truncate_item(trans, root, path, keep, 0);
+        BUG_ON(ret);
+        key->offset = range_end;
+        ret = btrfs_set_item_key_safe(trans, root, path, key);
+        BUG_ON(ret);
+        return 0;
+}
+/*
+ * deletes the csum items from the csum tree for the range of bytes
+ * [bytenr, bytenr + len).
+ *
+ * The search always starts at the last byte of the range and works
+ * backwards through the csum items.  An item that lies completely inside
+ * the range is deleted, an item that straddles one edge of the range is
+ * trimmed by truncate_one_csum, and an item that contains the whole range
+ * is split in place so that the following passes can trim the pieces.
+ *
+ * 'root' is only used to reach fs_info; the work is done on
+ * fs_info->csum_root.
+ *
+ * Returns 0 on success, -ENOMEM if no path could be allocated, or the
+ * negative error returned by btrfs_search_slot.
+ */
+int btrfs_del_csums(struct btrfs_trans_handle *trans,
+                    struct btrfs_root *root, u64 bytenr, u64 len)
+{
+        struct btrfs_path *path;
+        struct btrfs_key key;
+        u64 end_byte = bytenr + len;
+        u64 csum_end;
+        struct extent_buffer *leaf;
+        int ret;
+        int err = 0;
+        u16 csum_size =
+                btrfs_super_csum_size(&root->fs_info->super_copy);
+        int blocksize_bits = root->fs_info->sb->s_blocksize_bits;
+        root = root->fs_info->csum_root;
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+        while (1) {
+                key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+                key.offset = end_byte - 1;
+                key.type = BTRFS_EXTENT_CSUM_KEY;
+                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+                if (ret < 0) {
+                        /* don't look at the path after a failed search */
+                        err = ret;
+                        goto out;
+                }
+                if (ret > 0) {
+                        /* no exact match, step back to the previous item */
+                        if (path->slots[0] == 0)
+                                goto out;
+                        path->slots[0]--;
+                }
+                leaf = path->nodes[0];
+                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+                if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
+                    key.type != BTRFS_EXTENT_CSUM_KEY) {
+                        break;
+                }
+                if (key.offset >= end_byte)
+                        break;
+                /* first byte past the blocks this item has csums for */
+                csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size;
+                csum_end <<= blocksize_bits;
+                csum_end += key.offset;
+                /* this csum ends before we start, we're done */
+                if (csum_end <= bytenr)
+                        break;
+                /* delete the entire item, it is inside our range */
+                if (key.offset >= bytenr && csum_end <= end_byte) {
+                        ret = btrfs_del_item(trans, root, path);
+                        BUG_ON(ret);
+                        /* the item started exactly at bytenr, nothing left */
+                        if (key.offset == bytenr)
+                                break;
+                } else if (key.offset < bytenr && csum_end > end_byte) {
+                        unsigned long offset;
+                        unsigned long shift_len;
+                        unsigned long item_offset;
+                        /*
+                         *        [ bytenr - len ]
+                         *     [csum                ]
+                         *
+                         * Our bytes are in the middle of the csum,
+                         * we need to split this item and insert a new one.
+                         *
+                         * But we can't drop the path because the
+                         * csum could change, get removed, extended etc.
+                         *
+                         * The trick here is the max size of a csum item leaves
+                         * enough room in the tree block for a single
+                         * item header.  So, we split the item in place,
+                         * adding a new header pointing to the existing
+                         * bytes.  Then we loop around again and we have
+                         * a nicely formed csum item that we can neatly
+                         * truncate.
+                         */
+                        offset = (bytenr - key.offset) >> blocksize_bits;
+                        offset *= csum_size;
+                        shift_len = (len >> blocksize_bits) * csum_size;
+                        item_offset = btrfs_item_ptr_offset(leaf,
+                                                            path->slots[0]);
+                        /* zero the csums that belong to our range */
+                        memset_extent_buffer(leaf, 0, item_offset + offset,
+                                             shift_len);
+                        key.offset = bytenr;
+                        /*
+                         * btrfs_split_item returns -EAGAIN when the
+                         * item changed size or key
+                         */
+                        ret = btrfs_split_item(trans, root, path, &key, offset);
+                        BUG_ON(ret && ret != -EAGAIN);
+                        key.offset = end_byte - 1;
+                } else {
+                        ret = truncate_one_csum(trans, root, path,
+                                                &key, bytenr, len);
+                        BUG_ON(ret);
+                        /* the item started before our range, we're done */
+                        if (key.offset < bytenr)
+                                break;
+                }
+                btrfs_release_path(root, path);
+        }
+out:
+        btrfs_free_path(path);
+        return err;
+}
+/*
+ * Stores the checksums carried by 'sums' in csum items of the tree 'root'.
+ *
+ * sums->sums is consumed one entry at a time, each entry accounting for
+ * root->sectorsize bytes of sums->len.  For the current entry the code
+ * either finds an existing csum item that already has a slot for
+ * sector_sum->bytenr, grows an existing item by exactly one csum, or
+ * inserts a new item keyed at that byte number.  It then keeps copying
+ * csums for the following entries into the same item for as long as the
+ * item has room and consecutive entries are PAGE_CACHE_SIZE apart on disk,
+ * and starts over at 'again' for whatever is left.
+ *
+ * Returns 0 once every entry has been written, or the non-zero result of
+ * the btrfs_search_slot / btrfs_insert_empty_item call that failed.
+ */
+int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
+                           struct btrfs_root *root,
+                           struct btrfs_ordered_sum *sums)
+{
+        u64 bytenr;
+        int ret;
+        struct btrfs_key file_key;
+        struct btrfs_key found_key;
+        u64 next_offset;
+        u64 total_bytes = 0;
+        int found_next;
+        struct btrfs_path *path;
+        struct btrfs_csum_item *item;
+        struct btrfs_csum_item *item_end;
+        struct extent_buffer *leaf = NULL;
+        u64 csum_offset;
+        struct btrfs_sector_sum *sector_sum;
+        u32 nritems;
+        u32 ins_size;
+        char *eb_map;
+        char *eb_token;
+        unsigned long map_len;
+        unsigned long map_start;
+        u16 csum_size =
+                btrfs_super_csum_size(&root->fs_info->super_copy);
+        path = btrfs_alloc_path();
+        BUG_ON(!path);
+        sector_sum = sums->sums;
+again:
+        next_offset = (u64)-1;
+        found_next = 0;
+        file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+        file_key.offset = sector_sum->bytenr;
+        bytenr = sector_sum->bytenr;
+        btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY);
+        item = btrfs_lookup_csum(trans, root, path, sector_sum->bytenr, 1);
+        if (!IS_ERR(item)) {
+                /* an existing item already has room for this csum */
+                leaf = path->nodes[0];
+                ret = 0;
+                goto found;
+        }
+        ret = PTR_ERR(item);
+        if (ret == -EFBIG) {
+                u32 item_size;
+                /* we found one, but it isn't big enough yet */
+                leaf = path->nodes[0];
+                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+                if ((item_size / csum_size) >=
+                    MAX_CSUM_ITEMS(root, csum_size)) {
+                        /* already at max size, make a new one */
+                        goto insert;
+                }
+        } else {
+                int slot = path->slots[0] + 1;
+                /* we didn't find a csum item, insert one */
+                nritems = btrfs_header_nritems(path->nodes[0]);
+                /*
+                 * An empty leaf has no slot after the current one either.
+                 * Test for it explicitly: nritems is unsigned, so
+                 * nritems - 1 would wrap and the slot check alone would
+                 * let us read a key from a slot that does not exist.
+                 */
+                if (!nritems || path->slots[0] >= nritems - 1) {
+                        ret = btrfs_next_leaf(root, path);
+                        if (ret == 1)
+                                found_next = 1;
+                        if (ret != 0)
+                                goto insert;
+                        slot = 0;
+                }
+                /*
+                 * look at the item after ours; if it is a csum item its
+                 * offset caps how large the new item may be
+                 */
+                btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot);
+                if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
+                    found_key.type != BTRFS_EXTENT_CSUM_KEY) {
+                        found_next = 1;
+                        goto insert;
+                }
+                next_offset = found_key.offset;
+                found_next = 1;
+                goto insert;
+        }
+        /*
+         * at this point, we know the tree has an item, but it isn't big
+         * enough yet to put our csum in.  Grow it
+         */
+        btrfs_release_path(root, path);
+        ret = btrfs_search_slot(trans, root, &file_key, path,
+                                csum_size, 1);
+        if (ret < 0)
+                goto fail_unlock;
+        if (ret > 0) {
+                if (path->slots[0] == 0)
+                        goto insert;
+                path->slots[0]--;
+        }
+        leaf = path->nodes[0];
+        btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+        /* index our csum would have inside the item we landed on */
+        csum_offset = (bytenr - found_key.offset) >>
+                        root->fs_info->sb->s_blocksize_bits;
+        if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY ||
+            found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
+            csum_offset >= MAX_CSUM_ITEMS(root, csum_size)) {
+                goto insert;
+        }
+        if (csum_offset >= btrfs_item_size_nr(leaf, path->slots[0]) /
+            csum_size) {
+                u32 diff = (csum_offset + 1) * csum_size;
+                /*
+                 * is the item big enough already?  we dropped our lock
+                 * before and need to recheck
+                 */
+                if (diff < btrfs_item_size_nr(leaf, path->slots[0]))
+                        goto csum;
+                diff = diff - btrfs_item_size_nr(leaf, path->slots[0]);
+                /* only grow the item when exactly one csum is missing */
+                if (diff != csum_size)
+                        goto insert;
+                ret = btrfs_extend_item(trans, root, path, diff);
+                BUG_ON(ret);
+                goto csum;
+        }
+insert:
+        btrfs_release_path(root, path);
+        csum_offset = 0;
+        if (found_next) {
+                /*
+                 * size the new item for the whole run of entries that
+                 * follow each other on disk, capped by the offset of the
+                 * next csum item and by the max item size
+                 */
+                u64 tmp = total_bytes + root->sectorsize;
+                u64 next_sector = sector_sum->bytenr;
+                struct btrfs_sector_sum *next = sector_sum + 1;
+                while (tmp < sums->len) {
+                        if (next_sector + root->sectorsize != next->bytenr)
+                                break;
+                        tmp += root->sectorsize;
+                        next_sector = next->bytenr;
+                        next++;
+                }
+                tmp = min(tmp, next_offset - file_key.offset);
+                tmp >>= root->fs_info->sb->s_blocksize_bits;
+                tmp = max((u64)1, tmp);
+                tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size));
+                ins_size = csum_size * tmp;
+        } else {
+                ins_size = csum_size;
+        }
+        ret = btrfs_insert_empty_item(trans, root, path, &file_key,
+                                      ins_size);
+        if (ret < 0)
+                goto fail_unlock;
+        if (ret != 0) {
+                WARN_ON(1);
+                goto fail_unlock;
+        }
+csum:
+        leaf = path->nodes[0];
+        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
+        ret = 0;
+        item = (struct btrfs_csum_item *)((unsigned char *)item +
+                                          csum_offset * csum_size);
+found:
+        /* item_end marks the first byte past the item's data */
+        item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
+        item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
+                                      btrfs_item_size_nr(leaf, path->slots[0]));
+        eb_token = NULL;
+        cond_resched();
+next_sector:
+        /* (re)map the extent buffer when 'item' is outside the mapping */
+        if (!eb_token ||
+           (unsigned long)item + csum_size >= map_start + map_len) {
+                int err;
+                if (eb_token)
+                        unmap_extent_buffer(leaf, eb_token, KM_USER1);
+                eb_token = NULL;
+                err = map_private_extent_buffer(leaf, (unsigned long)item,
+                                                csum_size,
+                                                &eb_token, &eb_map,
+                                                &map_start, &map_len, KM_USER1);
+                if (err)
+                        eb_token = NULL;
+        }
+        if (eb_token) {
+                memcpy(eb_token + ((unsigned long)item & (PAGE_CACHE_SIZE - 1)),
+                       &sector_sum->sum, csum_size);
+        } else {
+                /* no mapping available, go through the generic helper */
+                write_extent_buffer(leaf, &sector_sum->sum,
+                                    (unsigned long)item, csum_size);
+        }
+        total_bytes += root->sectorsize;
+        sector_sum++;
+        if (total_bytes < sums->len) {
+                item = (struct btrfs_csum_item *)((char *)item +
+                                                  csum_size);
+                /* stay in this item while it has room and we're contiguous */
+                if (item < item_end && bytenr + PAGE_CACHE_SIZE ==
+                    sector_sum->bytenr) {
+                        bytenr = sector_sum->bytenr;
+                        goto next_sector;
+                }
+        }
+        if (eb_token) {
+                unmap_extent_buffer(leaf, eb_token, KM_USER1);
+                eb_token = NULL;
+        }
+        btrfs_mark_buffer_dirty(path->nodes[0]);
+        cond_resched();
+        if (total_bytes < sums->len) {
+                btrfs_release_path(root, path);
+                goto again;
+        }
+out:
+        btrfs_free_path(path);
+        return ret;
+fail_unlock:
+        goto out;
+}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
new file mode 100644
index 000000000000..90268334145e
--- /dev/null
+++ b/fs/btrfs/file.c
@@ -0,0 +1,1288 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/backing-dev.h>
+#include <linux/mpage.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/statfs.h>
+#include <linux/compat.h>
+#include <linux/version.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "ioctl.h"
+#include "print-tree.h"
+#include "tree-log.h"
+#include "locking.h"
+#include "compat.h"
+/*
+ * Copies write_bytes bytes from the user buffer 'buf' into the pages in
+ * prepared_pages.  The first page is filled starting at the in-page
+ * offset of 'pos', every following page starting at offset 0.
+ *
+ * Returns 0, or -EFAULT as soon as one chunk could not be copied in full.
+ *
+ * This should go away and be replaced with calls into generic code.
+ */
+static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
+                                         int write_bytes,
+                                         struct page **prepared_pages,
+                                         const char __user *buf)
+{
+        int idx = 0;
+        int page_off = pos & (PAGE_CACHE_SIZE - 1);
+        long uncopied;
+        while (idx < num_pages && write_bytes > 0) {
+                struct page *page = prepared_pages[idx];
+                size_t chunk = min_t(size_t,
+                                     PAGE_CACHE_SIZE - page_off, write_bytes);
+                fault_in_pages_readable(buf, chunk);
+                /* pull this chunk out of userspace into the page */
+                kmap(page);
+                uncopied = __copy_from_user(page_address(page) + page_off,
+                                            buf, chunk);
+                /* keep the processor's dcache coherent for this page */
+                flush_dcache_page(page);
+                kunmap(page);
+                if (uncopied)
+                        return -EFAULT;
+                buf += chunk;
+                write_bytes -= chunk;
+                /* only the first page starts part way in */
+                page_off = 0;
+                idx++;
+        }
+        return 0;
+}
+/*
+ * Lets go of the pages btrfs_file_write was working on: each page is
+ * unlocked, marked accessed and has its reference dropped.  Stops at the
+ * first NULL entry or after num_pages entries, whichever comes first.
+ */
+static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages)
+{
+        size_t idx = 0;
+        while (idx < num_pages && pages[idx]) {
+                struct page *page = pages[idx];
+                /*
+                 * PageChecked is some magic around finding pages that
+                 * have been modified without going through
+                 * btrfs_set_page_dirty; reset it before the page goes
+                 */
+                ClearPageChecked(page);
+                unlock_page(page);
+                mark_page_accessed(page);
+                page_cache_release(page);
+                idx++;
+        }
+}
+/*
+ * Called after copy_from_user has filled 'pages' for a write of
+ * write_bytes bytes at file offset 'pos'.
+ *
+ * With the sector-aligned range around the write locked in the inode's
+ * io_tree, this joins a transaction, marks the range uptodate and
+ * delalloc, flags every page uptodate and dirty, and moves i_size out to
+ * the end of the write when the write went past the old size.
+ *
+ * The 'trans' argument is overwritten by the transaction joined here.
+ * Returns the result of btrfs_end_transaction, or -ENOMEM when no
+ * transaction could be joined.
+ */
+static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
+                                   struct btrfs_root *root,
+                                   struct file *file,
+                                   struct page **pages,
+                                   size_t num_pages,
+                                   loff_t pos,
+                                   size_t write_bytes)
+{
+        struct inode *inode = fdentry(file)->d_inode;
+        struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+        u64 sector_mask = (u64)root->sectorsize - 1;
+        u64 range_start = pos & ~sector_mask;
+        u64 range_len = (write_bytes + pos - range_start +
+                         root->sectorsize - 1) & ~sector_mask;
+        u64 range_last = range_start + range_len - 1;
+        u64 write_end = pos + write_bytes;
+        loff_t old_size = i_size_read(inode);
+        size_t idx;
+        int err;
+        lock_extent(tree, range_start, range_last, GFP_NOFS);
+        trans = btrfs_join_transaction(root, 1);
+        if (trans) {
+                btrfs_set_trans_block_group(trans, inode);
+                set_extent_uptodate(tree, range_start, range_last, GFP_NOFS);
+                /* check for reserved extents on each page, we don't want
+                 * to reset the delalloc bit on things that already have
+                 * extents reserved.
+                 */
+                btrfs_set_extent_delalloc(inode, range_start, range_last);
+                for (idx = 0; idx < num_pages; idx++) {
+                        struct page *page = pages[idx];
+                        SetPageUptodate(page);
+                        ClearPageChecked(page);
+                        set_page_dirty(page);
+                }
+                /* the write went past the old EOF, record the new size */
+                if (write_end > old_size) {
+                        i_size_write(inode, write_end);
+                        btrfs_update_inode(trans, root, inode);
+                }
+                err = btrfs_end_transaction(trans, root);
+        } else {
+                err = -ENOMEM;
+        }
+        unlock_extent(tree, range_start, range_last, GFP_NOFS);
+        return err;
+}
+/*
+ * this drops all the extents in the cache that intersect the range
+ * [start, end].  Existing extents are split as required.
+ *
+ * 'end' is inclusive (len is end - start + 1).  Passing (u64)-1 as 'end'
+ * turns off every test against the end of the range, presumably so that
+ * everything from 'start' onwards is dropped.
+ *
+ * Each pass of the loop looks up one extent map overlapping the range and
+ * removes it from the tree.  If the map starts in front of the range, a
+ * new map for the part before 'start' is added; if it reaches past the end
+ * of the range, a new map for the part after it is added.  Maps whose
+ * block_start is >= EXTENT_MAP_LAST_BYTE are removed without creating
+ * split pieces.
+ *
+ * When skip_pinned is set, maps with EXTENT_FLAG_PINNED are left in the
+ * tree and the range is adjusted around them; otherwise the pinned bit is
+ * cleared and the map is dropped like any other.
+ *
+ * The two spare maps used for the split pieces are allocated before
+ * em_tree->lock is taken.  NOTE(review): the results of alloc_extent_map
+ * are not checked before use -- confirm whether it can return NULL here.
+ *
+ * Always returns 0.
+ */
+int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
+                            int skip_pinned)
+{
+        struct extent_map *em;
+        struct extent_map *split = NULL;
+        struct extent_map *split2 = NULL;
+        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+        u64 len = end - start + 1;
+        int ret;
+        int testend = 1;
+        unsigned long flags;
+        int compressed = 0;
+        WARN_ON(end < start);
+        if (end == (u64)-1) { /* open-ended range */
+                len = (u64)-1;
+                testend = 0;
+        }
+        while (1) {
+                if (!split) /* refill the spares outside the spinlock */
+                        split = alloc_extent_map(GFP_NOFS);
+                if (!split2)
+                        split2 = alloc_extent_map(GFP_NOFS);
+                spin_lock(&em_tree->lock);
+                em = lookup_extent_mapping(em_tree, start, len);
+                if (!em) { /* nothing left in the range */
+                        spin_unlock(&em_tree->lock);
+                        break;
+                }
+                flags = em->flags; /* copied into the split pieces below */
+                if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
+                        spin_unlock(&em_tree->lock);
+                        if (em->start <= start &&
+                            (!testend || em->start + em->len >= start + len)) {
+                                free_extent_map(em); /* pinned map covers the rest */
+                                break;
+                        }
+                        if (start < em->start) {
+                                len = em->start - start; /* stop in front of it */
+                        } else {
+                                len = start + len - (em->start + em->len);
+                                start = em->start + em->len; /* resume behind it */
+                        }
+                        free_extent_map(em);
+                        continue;
+                }
+                compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+                clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+                remove_extent_mapping(em_tree, em);
+                if (em->block_start < EXTENT_MAP_LAST_BYTE &&
+                    em->start < start) { /* keep the part before the range */
+                        split->start = em->start;
+                        split->len = start - em->start;
+                        split->orig_start = em->orig_start;
+                        split->block_start = em->block_start;
+                        if (compressed)
+                                split->block_len = em->block_len;
+                        else
+                                split->block_len = split->len;
+                        split->bdev = em->bdev;
+                        split->flags = flags;
+                        ret = add_extent_mapping(em_tree, split);
+                        BUG_ON(ret);
+                        free_extent_map(split);
+                        split = split2; /* second spare serves the tail piece */
+                        split2 = NULL;
+                }
+                if (em->block_start < EXTENT_MAP_LAST_BYTE &&
+                    testend && em->start + em->len > start + len) { /* keep the part after the range */
+                        u64 diff = start + len - em->start;
+                        split->start = start + len;
+                        split->len = em->start + em->len - (start + len);
+                        split->bdev = em->bdev;
+                        split->flags = flags;
+                        if (compressed) { /* compressed: block range is copied unchanged */
+                                split->block_len = em->block_len;
+                                split->block_start = em->block_start;
+                                split->orig_start = em->orig_start;
+                        } else {
+                                split->block_len = split->len;
+                                split->block_start = em->block_start + diff;
+                                split->orig_start = split->start;
+                        }
+                        ret = add_extent_mapping(em_tree, split);
+                        BUG_ON(ret);
+                        free_extent_map(split);
+                        split = NULL;
+                }
+                spin_unlock(&em_tree->lock);
+                /* once for us (presumably the reference from the lookup) */
+                free_extent_map(em);
+                /* once for the tree*/
+                free_extent_map(em);
+        }
+        if (split) /* release spares that were never used */
+                free_extent_map(split);
+        if (split2)
+                free_extent_map(split2);
+        return 0;
+}
+/*
+ * Consistency check hook for an inode's file extents.
+ *
+ * The check that used to live here (a walk over the inode's extent items
+ * that complained when a file offset went backwards) was compiled out
+ * behind an unconditional return, so this is a stub: it reports success
+ * without looking at 'root' or 'inode'.
+ */
+int btrfs_check_file(struct btrfs_root *root, struct inode *inode)
+{
+        return 0;
+}
+/*
+ * this is very complex, but the basic idea is to drop all extents
+ * in the range start - end.  hint_byte is filled in with a disk byte number
+ * that would be a good hint to the block allocator for this file.
+ *
+ * If an extent intersects the range but is not entirely inside the range
+ * it is either truncated or split.  Anything entirely inside the range
+ * is deleted from the tree.
+ *
+ * inline_limit is used to tell this code which offsets in the file to keep
+ * if they contain inline extents.
+ */
+noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
+                       struct btrfs_root *root, struct inode *inode,
+                       u64 start, u64 end, u64 inline_limit, u64 *hint_byte)
+{
+        u64 extent_end = 0;
+        u64 locked_end = end;
+        u64 search_start = start;
+        u64 leaf_start;
+        u64 ram_bytes = 0;
+        u64 orig_parent = 0;
+        u64 disk_bytenr = 0;
+        u8 compression;
+        u8 encryption;
+        u16 other_encoding = 0;
+        u64 root_gen;
+        u64 root_owner;
+        struct extent_buffer *leaf;
+        struct btrfs_file_extent_item *extent;
+        struct btrfs_path *path;
+        struct btrfs_key key;
+        struct btrfs_file_extent_item old;
+        int keep;
+        int slot;
+        int bookend;
+        int found_type = 0;
+        int found_extent;
+        int found_inline;
+        int recow;
+        int ret;
+        inline_limit = 0;
+        btrfs_drop_extent_cache(inode, start, end - 1, 0);
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+        while (1) {
+                recow = 0;
+                btrfs_release_path(root, path);
+                ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
+                                               search_start, -1);
+                if (ret < 0)
+                        goto out;
+                if (ret > 0) {
+                        if (path->slots[0] == 0) {
+                                ret = 0;
+                                goto out;
+                        }
+                        path->slots[0]--;
+                }
+next_slot:
+                keep = 0;
+                bookend = 0;
+                found_extent = 0;
+                found_inline = 0;
+                leaf_start = 0;
+                root_gen = 0;
+                root_owner = 0;
+                compression = 0;
+                encryption = 0;
+                extent = NULL;
+                leaf = path->nodes[0];
+                slot = path->slots[0];
+                ret = 0;
+                btrfs_item_key_to_cpu(leaf, &key, slot);
+                if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY &&
+                    key.offset >= end) {
+                        goto out;
+                }
+                if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY ||
+                    key.objectid != inode->i_ino) {
+                        goto out;
+                }
+                if (recow) {
+                        search_start = max(key.offset, start);
+                        continue;
+                }
+                if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
+                        extent = btrfs_item_ptr(leaf, slot,
+                                                struct btrfs_file_extent_item);
+                        found_type = btrfs_file_extent_type(leaf, extent);
+                        compression = btrfs_file_extent_compression(leaf,
+                                                                    extent);
+                        encryption = btrfs_file_extent_encryption(leaf,
+                                                                  extent);
+                        other_encoding = btrfs_file_extent_other_encoding(leaf,
+                                                                  extent);
+                        if (found_type == BTRFS_FILE_EXTENT_REG ||
+                            found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+                                extent_end =
+                                     btrfs_file_extent_disk_bytenr(leaf,
+                                                                   extent);
+                                if (extent_end)
+                                        *hint_byte = extent_end;
+                                extent_end = key.offset +
+                                     btrfs_file_extent_num_bytes(leaf, extent);
+                                ram_bytes = btrfs_file_extent_ram_bytes(leaf,
+                                                                extent);
+                                found_extent = 1;
+                        } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+                                found_inline = 1;
+                                extent_end = key.offset +
+                                     btrfs_file_extent_inline_len(leaf, extent);
+                        }
+                } else {
+                        extent_end = search_start;
+                }
+                /* we found nothing we can drop */
+                if ((!found_extent && !found_inline) ||
+                    search_start >= extent_end) {
+                        int nextret;
+                        u32 nritems;
+                        nritems = btrfs_header_nritems(leaf);
+                        if (slot >= nritems - 1) {
+                                nextret = btrfs_next_leaf(root, path);
+                                if (nextret)
+                                        goto out;
+                                recow = 1;
+                        } else {
+                                path->slots[0]++;
+                        }
+                        goto next_slot;
+                }
+                if (end <= extent_end && start >= key.offset && found_inline)
+                        *hint_byte = EXTENT_MAP_INLINE;
+                if (found_extent) {
+                        read_extent_buffer(leaf, &old, (unsigned long)extent,
+                                           sizeof(old));
+                        root_gen = btrfs_header_generation(leaf);
+                        root_owner = btrfs_header_owner(leaf);
+                        leaf_start = leaf->start;
+                }
+                if (end < extent_end && end >= key.offset) {
+                        bookend = 1;
+                        if (found_inline && start <= key.offset)
+                                keep = 1;
+                }
+                if (bookend && found_extent) {
+                        if (locked_end < extent_end) {
+                                ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
+                                                locked_end, extent_end - 1,
+                                                GFP_NOFS);
+                                if (!ret) {
+                                        btrfs_release_path(root, path);
+                                        lock_extent(&BTRFS_I(inode)->io_tree,
+                                                locked_end, extent_end - 1,
+                                                GFP_NOFS);
+                                        locked_end = extent_end;
+                                        continue;
+                                }
+                                locked_end = extent_end;
+                        }
+                        orig_parent = path->nodes[0]->start;
+                        disk_bytenr = le64_to_cpu(old.disk_bytenr);
+                        if (disk_bytenr != 0) {
+                                ret = btrfs_inc_extent_ref(trans, root,
+                                           disk_bytenr,
+                                           le64_to_cpu(old.disk_num_bytes),
+                                           orig_parent, root->root_key.objectid,
+                                           trans->transid, inode->i_ino);
+                                BUG_ON(ret);
+                        }
+                }
+                if (found_inline) {
+                        u64 mask = root->sectorsize - 1;
+                        search_start = (extent_end + mask) & ~mask;
+                } else
+                        search_start = extent_end;
+                /* truncate existing extent */
+                if (start > key.offset) {
+                        u64 new_num;
+                        u64 old_num;
+                        keep = 1;
+                        WARN_ON(start & (root->sectorsize - 1));
+                        if (found_extent) {
+                                new_num = start - key.offset;
+                                old_num = btrfs_file_extent_num_bytes(leaf,
+                                                                      extent);
+                                *hint_byte =
+                                        btrfs_file_extent_disk_bytenr(leaf,
+                                                                      extent);
+                                if (btrfs_file_extent_disk_bytenr(leaf,
+                                                                  extent)) {
+                                        inode_sub_bytes(inode, old_num -
+                                                        new_num);
+                                }
+                                btrfs_set_file_extent_num_bytes(leaf,
+                                                        extent, new_num);
+                                btrfs_mark_buffer_dirty(leaf);
+                        } else if (key.offset < inline_limit &&
+                                   (end > extent_end) &&
+                                   (inline_limit < extent_end)) {
+                                u32 new_size;
+                                new_size = btrfs_file_extent_calc_inline_size(
+                                                   inline_limit - key.offset);
+                                inode_sub_bytes(inode, extent_end -
+                                                inline_limit);
+                                btrfs_set_file_extent_ram_bytes(leaf, extent,
+                                                        new_size);
+                                if (!compression && !encryption) {
+                                        btrfs_truncate_item(trans, root, path,
+                                                            new_size, 1);
+                                }
+                        }
+                }
+                /* delete the entire extent */
+                if (!keep) {
+                        if (found_inline)
+                                inode_sub_bytes(inode, extent_end -
+                                                key.offset);
+                        ret = btrfs_del_item(trans, root, path);
+                        /* TODO update progress marker and return */
+                        BUG_ON(ret);
+                        extent = NULL;
+                        btrfs_release_path(root, path);
+                        /* the extent will be freed later */
+                }
+                if (bookend && found_inline && start <= key.offset) {
+                        u32 new_size;
+                        new_size = btrfs_file_extent_calc_inline_size(
+                                                   extent_end - end);
+                        inode_sub_bytes(inode, end - key.offset);
+                        btrfs_set_file_extent_ram_bytes(leaf, extent,
+                                                        new_size);
+                        if (!compression && !encryption)
+                                ret = btrfs_truncate_item(trans, root, path,
+                                                          new_size, 0);
+                        BUG_ON(ret);
+                }
+                /* create bookend, splitting the extent in two */
+                if (bookend && found_extent) {
+                        struct btrfs_key ins;
+                        ins.objectid = inode->i_ino;
+                        ins.offset = end;
+                        btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);
+                        btrfs_release_path(root, path);
+                        ret = btrfs_insert_empty_item(trans, root, path, &ins,
+                                                      sizeof(*extent));
+                        BUG_ON(ret);
+                        leaf = path->nodes[0];
+                        extent = btrfs_item_ptr(leaf, path->slots[0],
+                                                struct btrfs_file_extent_item);
+                        write_extent_buffer(leaf, &old,
+                                            (unsigned long)extent, sizeof(old));
+                        btrfs_set_file_extent_compression(leaf, extent,
+                                                          compression);
+                        btrfs_set_file_extent_encryption(leaf, extent,
+                                                         encryption);
+                        btrfs_set_file_extent_other_encoding(leaf, extent,
+                                                             other_encoding);
+                        btrfs_set_file_extent_offset(leaf, extent,
+                                    le64_to_cpu(old.offset) + end - key.offset);
+                        WARN_ON(le64_to_cpu(old.num_bytes) <
+                                (extent_end - end));
+                        btrfs_set_file_extent_num_bytes(leaf, extent,
+                                                        extent_end - end);
+                        /*
+                         * set the ram bytes to the size of the full extent
+                         * before splitting.  This is a worst case flag,
+                         * but its the best we can do because we don't know
+                         * how splitting affects compression
+                         */
+                        btrfs_set_file_extent_ram_bytes(leaf, extent,
+                                                        ram_bytes);
+                        btrfs_set_file_extent_type(leaf, extent, found_type);
+                        btrfs_mark_buffer_dirty(path->nodes[0]);
+                        if (disk_bytenr != 0) {
+                                ret = btrfs_update_extent_ref(trans, root,
+                                                disk_bytenr, orig_parent,
+                                                leaf->start,
+                                                root->root_key.objectid,
+                                                trans->transid, ins.objectid);
+                                BUG_ON(ret);
+                        }
+                        btrfs_release_path(root, path);
+                        if (disk_bytenr != 0)
+                                inode_add_bytes(inode, extent_end - end);
+                }
+                if (found_extent && !keep) {
+                        u64 old_disk_bytenr = le64_to_cpu(old.disk_bytenr);
+                        if (old_disk_bytenr != 0) {
+                                inode_sub_bytes(inode,
+                                                le64_to_cpu(old.num_bytes));
+                                ret = btrfs_free_extent(trans, root,
+                                                old_disk_bytenr,
+                                                le64_to_cpu(old.disk_num_bytes),
+                                                leaf_start, root_owner,
+                                                root_gen, key.objectid, 0);
+                                BUG_ON(ret);
+                                *hint_byte = old_disk_bytenr;
+                        }
+                }
+                if (search_start >= end) {
+                        ret = 0;
+                        goto out;
+                }
+        }
+out:
+        btrfs_free_path(path);
+        if (locked_end > end) {
+                unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1,
+                              GFP_NOFS);
+        }
+        btrfs_check_file(root, inode);
+        return ret;
+}
+/*
+ * Decide whether the item at 'slot' in 'leaf' can be merged with a
+ * neighbouring file extent of inode 'objectid' backed by disk extent 'bytenr'.
+ *
+ * The item qualifies only if it is an EXTENT_DATA item of that inode, of
+ * type BTRFS_FILE_EXTENT_REG, pointing at 'bytenr', with no compression,
+ * encryption or other encoding set.  A non-zero *start (or *end) is a
+ * constraint: the item must begin (or end) at exactly that file offset.
+ *
+ * Returns 1 and stores the item's file range in *start / *end when the item
+ * is mergeable; returns 0 and leaves both untouched otherwise.
+ */
+static int extent_mergeable(struct extent_buffer *leaf, int slot,
+                            u64 objectid, u64 bytenr, u64 *start, u64 *end)
+{
+        struct btrfs_file_extent_item *item;
+        struct btrfs_key found;
+        u64 item_start;
+        u64 item_end;
+        /* the neighbouring slot may fall off either end of the leaf */
+        if (slot < 0)
+                return 0;
+        if (slot >= btrfs_header_nritems(leaf))
+                return 0;
+        btrfs_item_key_to_cpu(leaf, &found, slot);
+        if (found.objectid != objectid)
+                return 0;
+        if (found.type != BTRFS_EXTENT_DATA_KEY)
+                return 0;
+        item = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+        if (btrfs_file_extent_type(leaf, item) != BTRFS_FILE_EXTENT_REG)
+                return 0;
+        if (btrfs_file_extent_disk_bytenr(leaf, item) != bytenr)
+                return 0;
+        /* any kind of encoding rules the item out */
+        if (btrfs_file_extent_compression(leaf, item) ||
+            btrfs_file_extent_encryption(leaf, item) ||
+            btrfs_file_extent_other_encoding(leaf, item))
+                return 0;
+        item_start = found.offset;
+        item_end = item_start + btrfs_file_extent_num_bytes(leaf, item);
+        /* zero means "no constraint" for either boundary */
+        if (*start && *start != item_start)
+                return 0;
+        if (*end && *end != item_end)
+                return 0;
+        *start = item_start;
+        *end = item_end;
+        return 1;
+}
+/*
+ * Mark extent in the range start - end as written.
+ *
+ * This changes extent type from 'pre-allocated' to 'regular'. If only
+ * part of extent is marked as written, the extent will be split into
+ * two or three.
+ *
+ * Cached extent mappings for start - (end - 1) are dropped first.  The file
+ * extent item found for the range must be of type
+ * BTRFS_FILE_EXTENT_PREALLOC and must cover the whole range; anything else
+ * trips a BUG_ON.
+ *
+ * Three situations are handled:
+ *  - the range is exactly the extent: neighbouring items accepted by
+ *    extent_mergeable() are folded into one regular item (dropping one
+ *    reference on the disk extent per deleted item); with no mergeable
+ *    neighbour the item's type is simply switched to regular.
+ *  - the item is cut at 'start' (split == start): the existing item is
+ *    shortened to end at 'start' and the remainder is described by a new
+ *    item keyed at 'start'.  If the extent also runs past 'end' the new
+ *    item stays pre-allocated and the lookup is repeated with split == end.
+ *  - the item is cut at 'end' (split == end): the existing item is moved
+ *    forward to begin at 'end' and start - end is described by a new
+ *    regular item.
+ * In the two split situations the written range is first offered to a
+ * mergeable neighbour; only if that fails is a new item inserted, which
+ * takes an extra reference on the disk extent via btrfs_inc_extent_ref().
+ *
+ * When cutting at 'start' and the extent reaches beyond 'locked_end', the
+ * io_tree range locked_end - (extent_end - 1) is locked as well; whatever
+ * was locked beyond 'end' is unlocked again before returning.
+ * NOTE(review): this presumably relies on the caller already holding the
+ * extent lock up to 'end' - confirm against callers.
+ *
+ * Always returns 0; failures of the checked helpers hit BUG_ON.  The result
+ * of the initial btrfs_search_slot() is only tested for > 0, a negative
+ * return is not handled here.
+ */
+int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
+                              struct btrfs_root *root,
+                              struct inode *inode, u64 start, u64 end)
+{
+        struct extent_buffer *leaf;
+        struct btrfs_path *path;
+        struct btrfs_file_extent_item *fi;
+        struct btrfs_key key;
+        u64 bytenr;
+        u64 num_bytes;
+        u64 extent_end;
+        u64 extent_offset;
+        u64 other_start;
+        u64 other_end;
+        u64 split = start;
+        u64 locked_end = end;
+        u64 orig_parent;
+        int extent_type;
+        int split_end = 1;
+        int ret;
+        btrfs_drop_extent_cache(inode, start, end - 1, 0);
+        path = btrfs_alloc_path();
+        BUG_ON(!path);
+again:
+        key.objectid = inode->i_ino;
+        key.type = BTRFS_EXTENT_DATA_KEY;
+        if (split == start)
+                key.offset = split;
+        else
+                key.offset = split - 1;
+        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+        if (ret > 0 && path->slots[0] > 0)
+                path->slots[0]--;
+        leaf = path->nodes[0];
+        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+        BUG_ON(key.objectid != inode->i_ino ||
+               key.type != BTRFS_EXTENT_DATA_KEY);
+        fi = btrfs_item_ptr(leaf, path->slots[0],
+                            struct btrfs_file_extent_item);
+        extent_type = btrfs_file_extent_type(leaf, fi);
+        BUG_ON(extent_type != BTRFS_FILE_EXTENT_PREALLOC);
+        extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
+        BUG_ON(key.offset > start || extent_end < end);
+        bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+        num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
+        extent_offset = btrfs_file_extent_offset(leaf, fi);
+        if (key.offset == start)
+                split = end; /* nothing to cut off in front of start */
+        if (key.offset == start && extent_end == end) {
+                int del_nr = 0;
+                int del_slot = 0;
+                u64 leaf_owner = btrfs_header_owner(leaf);
+                u64 leaf_gen = btrfs_header_generation(leaf);
+                other_start = end;
+                other_end = 0;
+                if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
+                                     bytenr, &other_start, &other_end)) {
+                        extent_end = other_end;
+                        del_slot = path->slots[0] + 1;
+                        del_nr++;
+                        ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
+                                                leaf->start, leaf_owner,
+                                                leaf_gen, inode->i_ino, 0);
+                        BUG_ON(ret);
+                }
+                other_start = 0;
+                other_end = start;
+                if (extent_mergeable(leaf, path->slots[0] - 1, inode->i_ino,
+                                     bytenr, &other_start, &other_end)) {
+                        key.offset = other_start;
+                        del_slot = path->slots[0];
+                        del_nr++;
+                        ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
+                                                leaf->start, leaf_owner,
+                                                leaf_gen, inode->i_ino, 0);
+                        BUG_ON(ret);
+                }
+                split_end = 0;
+                if (del_nr == 0) {
+                        btrfs_set_file_extent_type(leaf, fi,
+                                                   BTRFS_FILE_EXTENT_REG);
+                        goto done;
+                }
+                fi = btrfs_item_ptr(leaf, del_slot - 1, /* the surviving item */
+                                    struct btrfs_file_extent_item);
+                btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG);
+                btrfs_set_file_extent_num_bytes(leaf, fi,
+                                                extent_end - key.offset);
+                btrfs_mark_buffer_dirty(leaf);
+                ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
+                BUG_ON(ret);
+                goto done;
+        } else if (split == start) { /* cut the item at start */
+                if (locked_end < extent_end) {
+                        ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
+                                        locked_end, extent_end - 1, GFP_NOFS);
+                        if (!ret) {
+                                btrfs_release_path(root, path);
+                                lock_extent(&BTRFS_I(inode)->io_tree,
+                                        locked_end, extent_end - 1, GFP_NOFS);
+                                locked_end = extent_end;
+                                goto again;
+                        }
+                        locked_end = extent_end;
+                }
+                btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset);
+                extent_offset += split - key.offset;
+        } else  { /* cut the item at end */
+                BUG_ON(key.offset != start);
+                btrfs_set_file_extent_offset(leaf, fi, extent_offset +
+                                             split - key.offset);
+                btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split);
+                key.offset = split;
+                btrfs_set_item_key_safe(trans, root, path, &key);
+                extent_end = split;
+        }
+        if (extent_end == end) {
+                split_end = 0;
+                extent_type = BTRFS_FILE_EXTENT_REG;
+        }
+        if (extent_end == end && split == start) {
+                other_start = end;
+                other_end = 0;
+                if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
+                                     bytenr, &other_start, &other_end)) {
+                        path->slots[0]++;
+                        fi = btrfs_item_ptr(leaf, path->slots[0],
+                                            struct btrfs_file_extent_item);
+                        key.offset = split;
+                        btrfs_set_item_key_safe(trans, root, path, &key);
+                        btrfs_set_file_extent_offset(leaf, fi, extent_offset);
+                        btrfs_set_file_extent_num_bytes(leaf, fi,
+                                                        other_end - split);
+                        goto done;
+                }
+        }
+        if (extent_end == end && split == end) {
+                other_start = 0;
+                other_end = start;
+                if (extent_mergeable(leaf, path->slots[0] - 1 , inode->i_ino,
+                                     bytenr, &other_start, &other_end)) {
+                        path->slots[0]--;
+                        fi = btrfs_item_ptr(leaf, path->slots[0],
+                                            struct btrfs_file_extent_item);
+                        btrfs_set_file_extent_num_bytes(leaf, fi, extent_end -
+                                                        other_start);
+                        goto done;
+                }
+        }
+        btrfs_mark_buffer_dirty(leaf);
+        orig_parent = leaf->start;
+        ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes,
+                                   orig_parent, root->root_key.objectid,
+                                   trans->transid, inode->i_ino);
+        BUG_ON(ret);
+        btrfs_release_path(root, path);
+        key.offset = start;
+        ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*fi));
+        BUG_ON(ret);
+        leaf = path->nodes[0];
+        fi = btrfs_item_ptr(leaf, path->slots[0],
+                            struct btrfs_file_extent_item);
+        btrfs_set_file_extent_generation(leaf, fi, trans->transid);
+        btrfs_set_file_extent_type(leaf, fi, extent_type);
+        btrfs_set_file_extent_disk_bytenr(leaf, fi, bytenr);
+        btrfs_set_file_extent_disk_num_bytes(leaf, fi, num_bytes);
+        btrfs_set_file_extent_offset(leaf, fi, extent_offset);
+        btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset);
+        btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
+        btrfs_set_file_extent_compression(leaf, fi, 0);
+        btrfs_set_file_extent_encryption(leaf, fi, 0);
+        btrfs_set_file_extent_other_encoding(leaf, fi, 0);
+        if (orig_parent != leaf->start) { /* new item landed in another leaf */
+                ret = btrfs_update_extent_ref(trans, root, bytenr,
+                                              orig_parent, leaf->start,
+                                              root->root_key.objectid,
+                                              trans->transid, inode->i_ino);
+                BUG_ON(ret);
+        }
+done:
+        btrfs_mark_buffer_dirty(leaf);
+        btrfs_release_path(root, path);
+        if (split_end && split == start) { /* tail still needs splitting */
+                split = end;
+                goto again;
+        }
+        if (locked_end > end) {
+                unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1,
+                              GFP_NOFS);
+        }
+        btrfs_free_path(path);
+        return 0;
+}
+/*
+ * this gets pages into the page cache and locks them down, it also properly
+ * waits for data=ordered extents to finish before allowing the pages to be
+ * modified.
+ *
+ * 'pages' must have room for 'num_pages' entries; it is zeroed and then
+ * filled with the locked pages covering the write starting at 'pos'.  If the
+ * sector-aligned start lies beyond i_size the file is first expanded with
+ * btrfs_cont_expand().  first_index, last_index and write_bytes are accepted
+ * for the caller's convenience but not used here.
+ *
+ * Returns 0 with all pages locked, the error from btrfs_cont_expand(), or
+ * -ENOMEM if a page could not be obtained; on -ENOMEM every page grabbed so
+ * far has been unlocked and released again.
+ */
+static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
+                         struct page **pages, size_t num_pages,
+                         loff_t pos, unsigned long first_index,
+                         unsigned long last_index, size_t write_bytes)
+{
+        int i;
+        unsigned long index = pos >> PAGE_CACHE_SHIFT;
+        struct inode *inode = fdentry(file)->d_inode;
+        int err = 0;
+        u64 start_pos;
+        u64 last_pos;
+        start_pos = pos & ~((u64)root->sectorsize - 1);
+        last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;
+        if (start_pos > inode->i_size) {
+                err = btrfs_cont_expand(inode, start_pos);
+                if (err)
+                        return err;
+        }
+        memset(pages, 0, num_pages * sizeof(struct page *));
+again:
+        for (i = 0; i < num_pages; i++) {
+                pages[i] = grab_cache_page(inode->i_mapping, index + i);
+                if (!pages[i]) {
+                        /*
+                         * out of memory: hand back the pages we already
+                         * hold instead of crashing, the caller bails out
+                         * on a non-zero return
+                         */
+                        while (--i >= 0) {
+                                unlock_page(pages[i]);
+                                page_cache_release(pages[i]);
+                                pages[i] = NULL;
+                        }
+                        return -ENOMEM;
+                }
+                wait_on_page_writeback(pages[i]);
+        }
+        if (start_pos < inode->i_size) {
+                struct btrfs_ordered_extent *ordered;
+                lock_extent(&BTRFS_I(inode)->io_tree,
+                            start_pos, last_pos - 1, GFP_NOFS);
+                ordered = btrfs_lookup_first_ordered_extent(inode,
+                                                            last_pos - 1);
+                /*
+                 * an ordered extent overlaps our range: drop everything,
+                 * wait for it to finish and start over
+                 */
+                if (ordered &&
+                    ordered->file_offset + ordered->len > start_pos &&
+                    ordered->file_offset < last_pos) {
+                        btrfs_put_ordered_extent(ordered);
+                        unlock_extent(&BTRFS_I(inode)->io_tree,
+                                      start_pos, last_pos - 1, GFP_NOFS);
+                        for (i = 0; i < num_pages; i++) {
+                                unlock_page(pages[i]);
+                                page_cache_release(pages[i]);
+                        }
+                        btrfs_wait_ordered_range(inode, start_pos,
+                                                 last_pos - start_pos);
+                        goto again;
+                }
+                if (ordered)
+                        btrfs_put_ordered_extent(ordered);
+                clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos,
+                                  last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC,
+                                  GFP_NOFS);
+                unlock_extent(&BTRFS_I(inode)->io_tree,
+                              start_pos, last_pos - 1, GFP_NOFS);
+        }
+        for (i = 0; i < num_pages; i++) {
+                clear_page_dirty_for_io(pages[i]);
+                set_page_extent_mapped(pages[i]);
+                WARN_ON(!PageLocked(pages[i]));
+        }
+        return 0;
+}
+/*
+ * write() implementation: copy 'count' bytes from the user buffer 'buf' into
+ * the page cache of 'file' starting at *ppos.
+ *
+ * The data is copied in chunks of at most 'nrptrs' pages.  For each chunk
+ * free space is checked, the pages are locked down with prepare_pages(),
+ * filled from user space and handed to dirty_and_release_pages().  When the
+ * write starts or ends in the middle of a page, that page is read in up
+ * front and kept referenced ('pinned') until the write is done.
+ *
+ * For O_SYNC / IS_SYNC / O_DIRECT writes, writeback is started per chunk and
+ * the ordered range is waited for at the end; sync writes additionally log
+ * the dentry (falling back to a transaction commit), O_DIRECT writes
+ * invalidate the written page cache range.
+ *
+ * *ppos is updated to the position reached.  Returns the number of bytes
+ * written if any were written, otherwise 0 or a negative errno (including
+ * -ENOMEM when the page pointer array or a pinned page cannot be allocated).
+ */
+static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
+                                size_t count, loff_t *ppos)
+{
+        loff_t pos;
+        loff_t start_pos;
+        ssize_t num_written = 0;
+        ssize_t err = 0;
+        int ret = 0;
+        struct inode *inode = fdentry(file)->d_inode;
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        struct page **pages = NULL;
+        int nrptrs;
+        struct page *pinned[2];
+        unsigned long first_index;
+        unsigned long last_index;
+        int will_write;
+        will_write = ((file->f_flags & O_SYNC) || IS_SYNC(inode) ||
+                      (file->f_flags & O_DIRECT));
+        /* the pointer array never takes more than one page of memory */
+        nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
+                     PAGE_CACHE_SIZE / (sizeof(struct page *)));
+        pinned[0] = NULL;
+        pinned[1] = NULL;
+        pos = *ppos;
+        start_pos = pos;
+        vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+        current->backing_dev_info = inode->i_mapping->backing_dev_info;
+        err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
+        if (err)
+                goto out_nolock;
+        if (count == 0)
+                goto out_nolock;
+        err = file_remove_suid(file);
+        if (err)
+                goto out_nolock;
+        file_update_time(file);
+        pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
+        if (!pages) {
+                err = -ENOMEM;
+                goto out_nolock;
+        }
+        mutex_lock(&inode->i_mutex);
+        BTRFS_I(inode)->sequence++;
+        first_index = pos >> PAGE_CACHE_SHIFT;
+        last_index = (pos + count) >> PAGE_CACHE_SHIFT;
+        /*
+         * there are lots of better ways to do this, but this code
+         * makes sure the first and last page in the file range are
+         * up to date and ready for cow
+         */
+        if ((pos & (PAGE_CACHE_SIZE - 1))) {
+                pinned[0] = grab_cache_page(inode->i_mapping, first_index);
+                if (!pinned[0]) {
+                        ret = -ENOMEM;
+                        goto out;
+                }
+                if (!PageUptodate(pinned[0])) {
+                        ret = btrfs_readpage(NULL, pinned[0]);
+                        BUG_ON(ret);
+                        wait_on_page_locked(pinned[0]);
+                } else {
+                        unlock_page(pinned[0]);
+                }
+        }
+        if ((pos + count) & (PAGE_CACHE_SIZE - 1)) {
+                pinned[1] = grab_cache_page(inode->i_mapping, last_index);
+                if (!pinned[1]) {
+                        ret = -ENOMEM;
+                        goto out;
+                }
+                if (!PageUptodate(pinned[1])) {
+                        ret = btrfs_readpage(NULL, pinned[1]);
+                        BUG_ON(ret);
+                        wait_on_page_locked(pinned[1]);
+                } else {
+                        unlock_page(pinned[1]);
+                }
+        }
+        while (count > 0) {
+                size_t offset = pos & (PAGE_CACHE_SIZE - 1);
+                size_t write_bytes = min(count, nrptrs *
+                                        (size_t)PAGE_CACHE_SIZE -
+                                         offset);
+                size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
+                                        PAGE_CACHE_SHIFT;
+                WARN_ON(num_pages > nrptrs);
+                memset(pages, 0, sizeof(struct page *) * nrptrs);
+                ret = btrfs_check_free_space(root, write_bytes, 0);
+                if (ret)
+                        goto out;
+                ret = prepare_pages(root, file, pages, num_pages,
+                                    pos, first_index, last_index,
+                                    write_bytes);
+                if (ret)
+                        goto out;
+                ret = btrfs_copy_from_user(pos, num_pages,
+                                           write_bytes, pages, buf);
+                if (ret) {
+                        btrfs_drop_pages(pages, num_pages);
+                        goto out;
+                }
+                ret = dirty_and_release_pages(NULL, root, file, pages,
+                                              num_pages, pos, write_bytes);
+                btrfs_drop_pages(pages, num_pages);
+                if (ret)
+                        goto out;
+                if (will_write) {
+                        btrfs_fdatawrite_range(inode->i_mapping, pos,
+                                               pos + write_bytes - 1,
+                                               WB_SYNC_NONE);
+                } else {
+                        balance_dirty_pages_ratelimited_nr(inode->i_mapping,
+                                                           num_pages);
+                        if (num_pages <
+                            (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
+                                btrfs_btree_balance_dirty(root, 1);
+                        btrfs_throttle(root);
+                }
+                buf += write_bytes;
+                count -= write_bytes;
+                pos += write_bytes;
+                num_written += write_bytes;
+                cond_resched();
+        }
+out:
+        mutex_unlock(&inode->i_mutex);
+        /*
+         * remember why we stopped so that a write that made no progress
+         * reports the error instead of returning 0
+         */
+        if (ret)
+                err = ret;
+out_nolock:
+        kfree(pages);
+        if (pinned[0])
+                page_cache_release(pinned[0]);
+        if (pinned[1])
+                page_cache_release(pinned[1]);
+        *ppos = pos;
+        if (num_written > 0 && will_write) {
+                struct btrfs_trans_handle *trans;
+                err = btrfs_wait_ordered_range(inode, start_pos, num_written);
+                if (err)
+                        num_written = err;
+                if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
+                        trans = btrfs_start_transaction(root, 1);
+                        ret = btrfs_log_dentry_safe(trans, root,
+                                                    file->f_dentry);
+                        if (ret == 0) {
+                                btrfs_sync_log(trans, root);
+                                btrfs_end_transaction(trans, root);
+                        } else {
+                                btrfs_commit_transaction(trans, root);
+                        }
+                }
+                if (file->f_flags & O_DIRECT) {
+                        invalidate_mapping_pages(inode->i_mapping,
+                              start_pos >> PAGE_CACHE_SHIFT,
+                             (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
+                }
+        }
+        current->backing_dev_info = NULL;
+        return num_written ? num_written : err;
+}
+/*
+ * ->release hook for btrfs files.  If the file still has something stored
+ * in private_data it is handed to btrfs_ioctl_trans_end().  Always
+ * returns 0.
+ */
+int btrfs_release_file(struct inode *inode, struct file *filp)
+{
+        if (!filp->private_data)
+                return 0;
+        btrfs_ioctl_trans_end(filp);
+        return 0;
+}
+/*
+ * fsync call for both files and directories.  This logs the inode into
+ * the tree log instead of forcing full commits whenever possible.
+ *
+ * It needs to start writeback and wait for the ordered extents so that all
+ * ordered extent updates in the metadata btree are up to date for copying
+ * to the log.
+ *
+ * It drops the inode mutex before doing the tree log commit.  This is an
+ * important optimization for directories because holding the mutex prevents
+ * new operations on the dir while we write to disk.
+ *
+ * Returns 0 on success or a negative errno.  Any positive status left over
+ * from ending/committing the transaction is reported as -EIO.
+ */
+int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
+{
+        struct inode *inode = dentry->d_inode;
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        int ret = 0;
+        struct btrfs_trans_handle *trans;
+        /*
+         * check the transaction that last modified this inode
+         * and see if its already been committed
+         */
+        if (!BTRFS_I(inode)->last_trans)
+                goto out;
+        mutex_lock(&root->fs_info->trans_mutex);
+        if (BTRFS_I(inode)->last_trans <=
+            root->fs_info->last_trans_committed) {
+                BTRFS_I(inode)->last_trans = 0;
+                mutex_unlock(&root->fs_info->trans_mutex);
+                goto out;
+        }
+        mutex_unlock(&root->fs_info->trans_mutex);
+        /* the batch counter is bumped both before and after the data wait */
+        root->fs_info->tree_log_batch++;
+        filemap_fdatawrite(inode->i_mapping);
+        btrfs_wait_ordered_range(inode, 0, (u64)-1);
+        root->fs_info->tree_log_batch++;
+        /*
+         * ok we haven't committed the transaction yet, lets do a commit
+         */
+        if (file->private_data)
+                btrfs_ioctl_trans_end(file);
+        trans = btrfs_start_transaction(root, 1);
+        if (!trans) {
+                ret = -ENOMEM;
+                goto out;
+        }
+        ret = btrfs_log_dentry_safe(trans, root, file->f_dentry);
+        /*
+         * NOTE(review): this exit neither ends nor commits trans; confirm
+         * whether the handle has to be released on this path.
+         */
+        if (ret < 0)
+                goto out;
+        /* we've logged all the items and now have a consistent
+         * version of the file in the log.  It is possible that
+         * someone will come in and modify the file, but that's
+         * fine because the log is consistent on disk, and we
+         * have references to all of the file's extents
+         *
+         * It is possible that someone will come in and log the
+         * file again, but that will end up using the synchronization
+         * inside btrfs_sync_log to keep things safe.
+         */
+        mutex_unlock(&file->f_dentry->d_inode->i_mutex);
+        /* a positive result from the logging call selects a full commit */
+        if (ret > 0) {
+                ret = btrfs_commit_transaction(trans, root);
+        } else {
+                btrfs_sync_log(trans, root);
+                ret = btrfs_end_transaction(trans, root);
+        }
+        mutex_lock(&file->f_dentry->d_inode->i_mutex);
+out:
+        /* error codes go back to the caller as negative errno values */
+        return ret > 0 ? -EIO : ret;
+}
+/*
+ * vm operations installed on a vma by btrfs_file_mmap(): faults go through
+ * filemap_fault and page_mkwrite goes to btrfs_page_mkwrite.
+ */
+static struct vm_operations_struct btrfs_file_vm_ops = {
+        .fault          = filemap_fault,
+        .page_mkwrite   = btrfs_page_mkwrite,
+};
+/*
+ * ->mmap hook: notes the access on the file and points the vma at the
+ * btrfs vm operations table.  Always returns 0.
+ */
+static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+        file_accessed(filp);
+        vma->vm_ops = &btrfs_file_vm_ops;
+        return 0;
+}
+/*
+ * file operations table for btrfs.  llseek, read, aio_read, splice_read
+ * and open point at generic helpers; write, mmap, release, fsync and the
+ * ioctl entry points are the btrfs implementations.  The compat ioctl
+ * entry is only present when CONFIG_COMPAT is set and reuses btrfs_ioctl.
+ */
+struct file_operations btrfs_file_operations = {
+        .llseek         = generic_file_llseek,
+        .read           = do_sync_read,
+        .aio_read       = generic_file_aio_read,
+        .splice_read    = generic_file_splice_read,
+        .write          = btrfs_file_write,
+        .mmap           = btrfs_file_mmap,
+        .open           = generic_file_open,
+        .release        = btrfs_release_file,
+        .fsync          = btrfs_sync_file,
+        .unlocked_ioctl = btrfs_ioctl,
+#ifdef CONFIG_COMPAT
+        .compat_ioctl   = btrfs_ioctl,
+#endif
+};
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
new file mode 100644
index 000000000000..d1e5f0e84c58
--- /dev/null
+++ b/fs/btrfs/free-space-cache.c
@@ -0,0 +1,495 @@
+/*
+ * Copyright (C) 2008 Red Hat.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#include <linux/sched.h>
+#include "ctree.h"
+/*
+ * link node into the rbtree at root, which is kept sorted by free space
+ * offset.  Returns 0 on success, or -EEXIST (without touching the tree)
+ * when an entry with the same offset is already present.
+ */
+static int tree_insert_offset(struct rb_root *root, u64 offset,
+                              struct rb_node *node)
+{
+        struct rb_node **link = &root->rb_node;
+        struct rb_node *above = NULL;
+        struct btrfs_free_space *cur;
+        while (*link) {
+                above = *link;
+                cur = rb_entry(above, struct btrfs_free_space, offset_index);
+                if (offset == cur->offset)
+                        return -EEXIST;
+                if (offset < cur->offset)
+                        link = &above->rb_left;
+                else
+                        link = &above->rb_right;
+        }
+        rb_link_node(node, above, link);
+        rb_insert_color(node, root);
+        return 0;
+}
+/*
+ * link node into the rbtree at root, which is kept sorted by free space
+ * size.  Duplicate sizes are allowed: an equal size descends to the right.
+ * Always returns 0.
+ */
+static int tree_insert_bytes(struct rb_root *root, u64 bytes,
+                             struct rb_node *node)
+{
+        struct rb_node **link = &root->rb_node;
+        struct rb_node *above = NULL;
+        struct btrfs_free_space *cur;
+        while (*link) {
+                above = *link;
+                cur = rb_entry(above, struct btrfs_free_space, bytes_index);
+                link = (bytes < cur->bytes) ? &above->rb_left :
+                                              &above->rb_right;
+        }
+        rb_link_node(node, above, link);
+        rb_insert_color(node, root);
+        return 0;
+}
+/*
+ * searches the tree for the given offset.  If contains is set we will return
+ * the free space that contains the given offset.  If contains is not set we
+ * will return the free space that starts at or after the given offset and is
+ * at least bytes long.
+ *
+ * Note that an entry starting before offset is returned whenever it covers
+ * offset and is at least bytes long, whether or not contains is set.
+ * Returns NULL when no entry qualifies.
+ */
+static struct btrfs_free_space *tree_search_offset(struct rb_root *root,
+                                                   u64 offset, u64 bytes,
+                                                   int contains)
+{
+        struct rb_node *n = root->rb_node;
+        struct btrfs_free_space *entry, *ret = NULL;
+        while (n) {
+                entry = rb_entry(n, struct btrfs_free_space, offset_index);
+                if (offset < entry->offset) {
+                        /*
+                         * entry starts after offset: when not looking for a
+                         * containing entry, remember the lowest big-enough
+                         * one seen so far and keep walking left
+                         */
+                        if (!contains &&
+                            (!ret || entry->offset < ret->offset) &&
+                            (bytes <= entry->bytes))
+                                ret = entry;
+                        n = n->rb_left;
+                } else if (offset > entry->offset) {
+                        /*
+                         * entry starts before offset: it wins outright if
+                         * it covers offset and is big enough
+                         */
+                        if ((entry->offset + entry->bytes - 1) >= offset &&
+                            bytes <= entry->bytes) {
+                                ret = entry;
+                                break;
+                        }
+                        n = n->rb_right;
+                } else {
+                        /* exact offset match, but too small: look right */
+                        if (bytes > entry->bytes) {
+                                n = n->rb_right;
+                                continue;
+                        }
+                        ret = entry;
+                        break;
+                }
+        }
+        return ret;
+}
+/*
+ * return a chunk at least bytes size, as close to offset that we can get.
+ *
+ * root is the size-sorted tree.  Only entries starting at or after offset
+ * are ever picked.  Returns NULL when no such entry is found on the search
+ * path.
+ */
+static struct btrfs_free_space *tree_search_bytes(struct rb_root *root,
+                                                  u64 offset, u64 bytes)
+{
+        struct rb_node *n = root->rb_node;
+        struct btrfs_free_space *entry, *ret = NULL;
+        while (n) {
+                entry = rb_entry(n, struct btrfs_free_space, bytes_index);
+                if (bytes < entry->bytes) {
+                        /*
+                         * We prefer to get a hole size as close to the size we
+                         * are asking for so we don't take small slivers out of
+                         * huge holes, but we also want to get as close to the
+                         * offset as possible so we don't have a whole lot of
+                         * fragmentation.
+                         */
+                        if (offset <= entry->offset) {
+                                if (!ret)
+                                        ret = entry;
+                                else if (entry->bytes < ret->bytes)
+                                        ret = entry;
+                                else if (entry->offset < ret->offset)
+                                        ret = entry;
+                        }
+                        n = n->rb_left;
+                } else if (bytes > entry->bytes) {
+                        /* entry is too small, bigger ones are to the right */
+                        n = n->rb_right;
+                } else {
+                        /*
+                         * Ok we may have multiple chunks of the wanted size,
+                         * so we don't want to take the first one we find, we
+                         * want to take the one closest to our given offset, so
+                         * keep searching just in case theres a better match.
+                         */
+                        n = n->rb_right;
+                        if (offset > entry->offset)
+                                continue;
+                        else if (!ret || entry->offset < ret->offset)
+                                ret = entry;
+                }
+        }
+        return ret;
+}
+/*
+ * take info out of both of the block group's free space rbtrees (the one
+ * indexed by size and the one indexed by offset).  info is not freed here.
+ */
+static void unlink_free_space(struct btrfs_block_group_cache *block_group,
+                              struct btrfs_free_space *info)
+{
+        rb_erase(&info->bytes_index, &block_group->free_space_bytes);
+        rb_erase(&info->offset_index, &block_group->free_space_offset);
+}
+/*
+ * insert info into both free space rbtrees of the block group, offset tree
+ * first.  If the offset insert fails its error is returned and the size
+ * tree is left alone; otherwise the result of the size insert is returned.
+ */
+static int link_free_space(struct btrfs_block_group_cache *block_group,
+                           struct btrfs_free_space *info)
+{
+        int err;
+        err = tree_insert_offset(&block_group->free_space_offset, info->offset,
+                                 &info->offset_index);
+        if (err)
+                return err;
+        return tree_insert_bytes(&block_group->free_space_bytes, info->bytes,
+                                 &info->bytes_index);
+}
+/*
+ * record [offset, offset + bytes) as free space in block_group, merging it
+ * with an existing entry directly to its right and/or left when there is
+ * one.  This function takes no lock itself.
+ *
+ * Returns 0 on success, -ENOMEM if the up-front allocation fails, or the
+ * error from link_free_space().  BUGs if the new range lands inside an
+ * existing entry, if the left neighbour does not end exactly at offset, or
+ * if linking fails with -EEXIST.
+ */
+static int __btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
+                                  u64 offset, u64 bytes)
+{
+        struct btrfs_free_space *right_info;
+        struct btrfs_free_space *left_info;
+        struct btrfs_free_space *info = NULL;
+        struct btrfs_free_space *alloc_info;
+        int ret = 0;
+        /* allocated before we know if it is needed; freed at out if unused */
+        alloc_info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
+        if (!alloc_info)
+                return -ENOMEM;
+        /*
+         * first we want to see if there is free space adjacent to the range we
+         * are adding, if there is remove that struct and add a new one to
+         * cover the entire range
+         */
+        right_info = tree_search_offset(&block_group->free_space_offset,
+                                        offset+bytes, 0, 1);
+        /* NOTE(review): offset-1 wraps around when offset is 0 - confirm */
+        left_info = tree_search_offset(&block_group->free_space_offset,
+                                       offset-1, 0, 1);
+        if (right_info && right_info->offset == offset+bytes) {
+                /* grow the right neighbour downwards to cover the new range */
+                unlink_free_space(block_group, right_info);
+                info = right_info;
+                info->offset = offset;
+                info->bytes += bytes;
+        } else if (right_info && right_info->offset != offset+bytes) {
+                printk(KERN_ERR "btrfs adding space in the middle of an "
+                       "existing free space area. existing: "
+                       "offset=%llu, bytes=%llu. new: offset=%llu, "
+                       "bytes=%llu\n", (unsigned long long)right_info->offset,
+                       (unsigned long long)right_info->bytes,
+                       (unsigned long long)offset,
+                       (unsigned long long)bytes);
+                BUG();
+        }
+        if (left_info) {
+                unlink_free_space(block_group, left_info);
+                if (unlikely((left_info->offset + left_info->bytes) !=
+                             offset)) {
+                        printk(KERN_ERR "btrfs free space to the left "
+                               "of new free space isn't "
+                               "quite right. existing: offset=%llu, "
+                               "bytes=%llu. new: offset=%llu, bytes=%llu\n",
+                               (unsigned long long)left_info->offset,
+                               (unsigned long long)left_info->bytes,
+                               (unsigned long long)offset,
+                               (unsigned long long)bytes);
+                        BUG();
+                }
+                if (info) {
+                        /* merged on both sides: fold left into info */
+                        info->offset = left_info->offset;
+                        info->bytes += left_info->bytes;
+                        kfree(left_info);
+                } else {
+                        /* only a left neighbour: extend it upwards */
+                        info = left_info;
+                        info->bytes += bytes;
+                }
+        }
+        if (info) {
+                /* re-insert the merged entry; alloc_info is freed at out */
+                ret = link_free_space(block_group, info);
+                if (!ret)
+                        info = NULL;
+                goto out;
+        }
+        /* no neighbours: the preallocated struct becomes the new entry */
+        info = alloc_info;
+        alloc_info = NULL;
+        info->offset = offset;
+        info->bytes = bytes;
+        ret = link_free_space(block_group, info);
+        if (ret)
+                kfree(info);
+out:
+        if (ret) {
+                printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret);
+                if (ret == -EEXIST)
+                        BUG();
+        }
+        kfree(alloc_info);
+        return ret;
+}
+/*
+ * take [offset, offset + bytes) out of the free space recorded for
+ * block_group.  This function takes no lock itself.
+ *
+ * Two layouts are handled: the range starts exactly at an existing entry,
+ * or it lies inside one (the entry is then split around the hole).
+ * Anything else only triggers a WARN_ON and still returns 0.
+ *
+ * Returns 0, or -EINVAL when the entry starting at offset is smaller than
+ * bytes.  Failures while re-linking what is left of an entry are fatal
+ * (BUG_ON).
+ */
+static int
+__btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
+                          u64 offset, u64 bytes)
+{
+        struct btrfs_free_space *info;
+        int ret = 0;
+        info = tree_search_offset(&block_group->free_space_offset, offset, 0,
+                                  1);
+        if (info && info->offset == offset) {
+                if (info->bytes < bytes) {
+                        printk(KERN_ERR "Found free space at %llu, size %llu, "
+                               "trying to use %llu\n",
+                               (unsigned long long)info->offset,
+                               (unsigned long long)info->bytes,
+                               (unsigned long long)bytes);
+                        WARN_ON(1);
+                        ret = -EINVAL;
+                        goto out;
+                }
+                unlink_free_space(block_group, info);
+                /* the whole entry is consumed */
+                if (info->bytes == bytes) {
+                        kfree(info);
+                        goto out;
+                }
+                /* trim the front of the entry and put it back */
+                info->offset += bytes;
+                info->bytes -= bytes;
+                ret = link_free_space(block_group, info);
+                BUG_ON(ret);
+        } else if (info && info->offset < offset &&
+                   info->offset + info->bytes >= offset + bytes) {
+                u64 old_start = info->offset;
+                /*
+                 * we're freeing space in the middle of the info,
+                 * this can happen during tree log replay
+                 *
+                 * first unlink the old info and then
+                 * insert it again after the hole we're creating
+                 */
+                unlink_free_space(block_group, info);
+                if (offset + bytes < info->offset + info->bytes) {
+                        u64 old_end = info->offset + info->bytes;
+                        info->offset = offset + bytes;
+                        info->bytes = old_end - info->offset;
+                        ret = link_free_space(block_group, info);
+                        BUG_ON(ret);
+                } else {
+                        /* the hole we're creating ends at the end
+                         * of the info struct, just free the info
+                         */
+                        kfree(info);
+                }
+                /* step two, insert a new info struct to cover anything
+                 * before the hole
+                 */
+                ret = __btrfs_add_free_space(block_group, old_start,
+                                             offset - old_start);
+                BUG_ON(ret);
+        } else {
+                /* NOTE(review): no error is reported to the caller here */
+                WARN_ON(1);
+        }
+out:
+        return ret;
+}
+/*
+ * add [offset, offset + bytes) to the block group's free space while
+ * holding alloc_mutex.  After the add, BUGs if no free space entry
+ * containing offset can be found.  Returns the result of
+ * __btrfs_add_free_space().
+ */
+int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
+                         u64 offset, u64 bytes)
+{
+        struct btrfs_free_space *found;
+        int err;
+        mutex_lock(&block_group->alloc_mutex);
+        err = __btrfs_add_free_space(block_group, offset, bytes);
+        found = tree_search_offset(&block_group->free_space_offset,
+                                   offset, 0, 1);
+        BUG_ON(!found);
+        mutex_unlock(&block_group->alloc_mutex);
+        return err;
+}
+/*
+ * same as btrfs_add_free_space() but without taking alloc_mutex here
+ * (presumably the caller already holds it - confirm against callers).
+ * BUGs if no entry containing offset exists after the add.
+ */
+int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group,
+                              u64 offset, u64 bytes)
+{
+        struct btrfs_free_space *found;
+        int err;
+        err = __btrfs_add_free_space(block_group, offset, bytes);
+        found = tree_search_offset(&block_group->free_space_offset,
+                                   offset, 0, 1);
+        BUG_ON(!found);
+        return err;
+}
+/*
+ * remove [offset, offset + bytes) from the block group's free space while
+ * holding alloc_mutex.  Returns the result of __btrfs_remove_free_space().
+ */
+int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
+                            u64 offset, u64 bytes)
+{
+        int err;
+        mutex_lock(&block_group->alloc_mutex);
+        err = __btrfs_remove_free_space(block_group, offset, bytes);
+        mutex_unlock(&block_group->alloc_mutex);
+        return err;
+}
+/*
+ * same as btrfs_remove_free_space() but without taking alloc_mutex here
+ * (presumably the caller already holds it - confirm against callers).
+ */
+int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group,
+                                 u64 offset, u64 bytes)
+{
+        return __btrfs_remove_free_space(block_group, offset, bytes);
+}
+/*
+ * debugging helper: counts the free space entries of block_group that are
+ * at least bytes long and logs the count together with the threshold.
+ * Walks the offset-sorted tree; no lock is taken here.
+ */
+void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
+                           u64 bytes)
+{
+        struct btrfs_free_space *info;
+        struct rb_node *n;
+        int count = 0;
+        for (n = rb_first(&block_group->free_space_offset); n; n = rb_next(n)) {
+                info = rb_entry(n, struct btrfs_free_space, offset_index);
+                if (info->bytes >= bytes)
+                        count++;
+        }
+        printk(KERN_INFO "%d blocks of free space at or bigger than %llu "
+               "bytes\n", count, (unsigned long long)bytes);
+}
+/*
+ * add up the sizes of every free space entry recorded for block_group and
+ * return the total in bytes.  No lock is taken here.
+ */
+u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group)
+{
+        struct rb_node *pos = rb_first(&block_group->free_space_offset);
+        u64 total = 0;
+        while (pos) {
+                total += rb_entry(pos, struct btrfs_free_space,
+                                  offset_index)->bytes;
+                pos = rb_next(pos);
+        }
+        return total;
+}
+/*
+ * free every free space entry of block_group, largest first.  alloc_mutex
+ * is held while entries are torn down and is dropped around cond_resched()
+ * whenever a reschedule is pending.
+ */
+void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
+{
+        struct btrfs_free_space *victim;
+        struct rb_node *last;
+        mutex_lock(&block_group->alloc_mutex);
+        for (;;) {
+                last = rb_last(&block_group->free_space_bytes);
+                if (last == NULL)
+                        break;
+                victim = rb_entry(last, struct btrfs_free_space, bytes_index);
+                unlink_free_space(block_group, victim);
+                kfree(victim);
+                if (!need_resched())
+                        continue;
+                mutex_unlock(&block_group->alloc_mutex);
+                cond_resched();
+                mutex_lock(&block_group->alloc_mutex);
+        }
+        mutex_unlock(&block_group->alloc_mutex);
+}
+/*
+ * compiled out: lookups by offset and by size that take alloc_mutex around
+ * the tree search.  Both drop the mutex before returning the entry they
+ * found.
+ */
+#if 0
+static struct btrfs_free_space *btrfs_find_free_space_offset(struct
+                                                      btrfs_block_group_cache
+                                                      *block_group, u64 offset,
+                                                      u64 bytes)
+{
+        struct btrfs_free_space *ret;
+        mutex_lock(&block_group->alloc_mutex);
+        ret = tree_search_offset(&block_group->free_space_offset, offset,
+                                 bytes, 0);
+        mutex_unlock(&block_group->alloc_mutex);
+        return ret;
+}
+static struct btrfs_free_space *btrfs_find_free_space_bytes(struct
+                                                     btrfs_block_group_cache
+                                                     *block_group, u64 offset,
+                                                     u64 bytes)
+{
+        struct btrfs_free_space *ret;
+        mutex_lock(&block_group->alloc_mutex);
+        ret = tree_search_bytes(&block_group->free_space_bytes, offset, bytes);
+        mutex_unlock(&block_group->alloc_mutex);
+        return ret;
+}
+#endif
+/*
+ * find a free space entry of at least bytes for an allocation hinted at
+ * offset.  The offset-sorted tree is tried first; if that finds nothing
+ * the size-sorted tree is searched.  Returns NULL when both searches fail.
+ * No lock is taken here.
+ */
+struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache
+                                               *block_group, u64 offset,
+                                               u64 bytes)
+{
+        struct btrfs_free_space *hit;
+        hit = tree_search_offset(&block_group->free_space_offset, offset,
+                                 bytes, 0);
+        if (hit)
+                return hit;
+        return tree_search_bytes(&block_group->free_space_bytes,
+                                 offset, bytes);
+}
diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h
new file mode 100644
index 000000000000..2a020b276768
--- /dev/null
+++ b/fs/btrfs/hash.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#ifndef __HASH__
+#define __HASH__
+#include "crc32c.h"
+/*
+ * hash the len bytes at name with btrfs_crc32c, seeded with (u32)~1.
+ */
+static inline u64 btrfs_name_hash(const char *name, int len)
+{
+        const u32 seed = (u32)~1;
+        return btrfs_crc32c(seed, name, len);
+}
+#endif
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
new file mode 100644
index 000000000000..3d46fa1f29a4
--- /dev/null
+++ b/fs/btrfs/inode-item.c
@@ -0,0 +1,206 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+/*
+ * walk the inode refs packed into the item that path points at
+ * (path->nodes[0], path->slots[0]) looking for one named name/name_len.
+ * Returns 1 and stores the ref in *ref_ret on a match; returns 0 and
+ * leaves *ref_ret alone otherwise.
+ */
+static int find_name_in_backref(struct btrfs_path *path, const char *name,
+                         int name_len, struct btrfs_inode_ref **ref_ret)
+{
+        struct extent_buffer *eb = path->nodes[0];
+        u32 total = btrfs_item_size_nr(eb, path->slots[0]);
+        unsigned long base = btrfs_item_ptr_offset(eb, path->slots[0]);
+        u32 pos = 0;
+        while (pos < total) {
+                struct btrfs_inode_ref *cur;
+                unsigned long cur_name;
+                int cur_len;
+                cur = (struct btrfs_inode_ref *)(base + pos);
+                cur_len = btrfs_inode_ref_name_len(eb, cur);
+                /* the name bytes sit right behind the ref header */
+                cur_name = (unsigned long)(cur + 1);
+                pos += cur_len + sizeof(*cur);
+                if (cur_len == name_len &&
+                    memcmp_extent_buffer(eb, name, cur_name, name_len) == 0) {
+                        *ref_ret = cur;
+                        return 1;
+                }
+        }
+        return 0;
+}
+/*
+ * remove the ref named name/name_len from the BTRFS_INODE_REF_KEY item
+ * keyed by (inode_objectid, ref_objectid).  If index is not NULL the
+ * index stored in the ref is copied to *index before the ref is removed.
+ *
+ * Returns 0 on success, -ENOMEM if no path could be allocated, -ENOENT if
+ * the item or the name is not found, or the error from the tree search or
+ * item deletion.
+ */
+int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
+                           struct btrfs_root *root,
+                           const char *name, int name_len,
+                           u64 inode_objectid, u64 ref_objectid, u64 *index)
+{
+        struct btrfs_path *path;
+        struct btrfs_key key;
+        struct btrfs_inode_ref *ref;
+        struct extent_buffer *leaf;
+        unsigned long ptr;
+        unsigned long item_start;
+        u32 item_size;
+        u32 sub_item_len;
+        int ret;
+        int del_len = name_len + sizeof(*ref);
+        key.objectid = inode_objectid;
+        key.offset = ref_objectid;
+        btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+        if (ret > 0) {
+                /* no item with this exact key */
+                ret = -ENOENT;
+                goto out;
+        } else if (ret < 0) {
+                goto out;
+        }
+        if (!find_name_in_backref(path, name, name_len, &ref)) {
+                ret = -ENOENT;
+                goto out;
+        }
+        leaf = path->nodes[0];
+        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+        if (index)
+                *index = btrfs_inode_ref_index(leaf, ref);
+        /* this ref is the only thing in the item: drop the whole item */
+        if (del_len == item_size) {
+                ret = btrfs_del_item(trans, root, path);
+                goto out;
+        }
+        /*
+         * otherwise slide whatever follows the ref down over it and
+         * shrink the item by the size of the removed ref
+         */
+        ptr = (unsigned long)ref;
+        sub_item_len = name_len + sizeof(*ref);
+        item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
+        memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
+                              item_size - (ptr + sub_item_len - item_start));
+        ret = btrfs_truncate_item(trans, root, path,
+                                  item_size - sub_item_len, 1);
+        BUG_ON(ret);
+out:
+        btrfs_free_path(path);
+        return ret;
+}
+/*
+ * add a ref carrying name/name_len and index to the BTRFS_INODE_REF_KEY
+ * item keyed by (inode_objectid, ref_objectid).  A new item is created if
+ * none exists yet; otherwise the existing item is extended and the ref is
+ * appended at its end.
+ *
+ * Returns 0 on success, -ENOMEM if no path could be allocated, -EEXIST if
+ * the item already holds a ref with this name, or the error from the item
+ * insertion.
+ */
+int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
+                           struct btrfs_root *root,
+                           const char *name, int name_len,
+                           u64 inode_objectid, u64 ref_objectid, u64 index)
+{
+        struct btrfs_path *path;
+        struct btrfs_key key;
+        struct btrfs_inode_ref *ref;
+        unsigned long ptr;
+        int ret;
+        int ins_len = name_len + sizeof(*ref);
+        key.objectid = inode_objectid;
+        key.offset = ref_objectid;
+        btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+        ret = btrfs_insert_empty_item(trans, root, path, &key,
+                                      ins_len);
+        if (ret == -EEXIST) {
+                u32 old_size;
+                /* name already recorded: bail out with ret still -EEXIST */
+                if (find_name_in_backref(path, name, name_len, &ref))
+                        goto out;
+                old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
+                ret = btrfs_extend_item(trans, root, path, ins_len);
+                BUG_ON(ret);
+                ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
+                                     struct btrfs_inode_ref);
+                /* the new ref starts where the item used to end */
+                ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size);
+                btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
+                btrfs_set_inode_ref_index(path->nodes[0], ref, index);
+                ptr = (unsigned long)(ref + 1);
+                ret = 0;
+        } else if (ret < 0) {
+                goto out;
+        } else {
+                /* brand new item: the ref sits at the start of it */
+                ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
+                                     struct btrfs_inode_ref);
+                btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
+                btrfs_set_inode_ref_index(path->nodes[0], ref, index);
+                ptr = (unsigned long)(ref + 1);
+        }
+        /* the name bytes are stored right behind the ref header */
+        write_extent_buffer(path->nodes[0], name, ptr, name_len);
+        btrfs_mark_buffer_dirty(path->nodes[0]);
+out:
+        btrfs_free_path(path);
+        return ret;
+}
+/*
+ * insert an empty item of sizeof(struct btrfs_inode_item) keyed by
+ * (objectid, BTRFS_INODE_ITEM_KEY, 0), using the caller's path.  On
+ * success root->highest_inode is raised to objectid if objectid is larger.
+ * Returns the result of btrfs_insert_empty_item().
+ */
+int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
+                             struct btrfs_root *root,
+                             struct btrfs_path *path, u64 objectid)
+{
+        struct btrfs_key key;
+        int err;
+        key.objectid = objectid;
+        key.offset = 0;
+        btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+        err = btrfs_insert_empty_item(trans, root, path, &key,
+                                      sizeof(struct btrfs_inode_item));
+        if (err)
+                return err;
+        if (objectid > root->highest_inode)
+                root->highest_inode = objectid;
+        return 0;
+}
+/*
+ * search root for the key in location, leaving the result in path.
+ * mod < 0 searches with an ins_len of -1, and any non-zero mod asks for
+ * cow.
+ *
+ * When the search misses on a BTRFS_ROOT_ITEM_KEY with offset (u64)-1 and
+ * the slot just before the landing slot holds a key with the same objectid
+ * and type, the path is stepped back onto that slot and 0 is returned.
+ * In every other case the result of btrfs_search_slot() is returned as is.
+ */
+int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
+                       *root, struct btrfs_path *path,
+                       struct btrfs_key *location, int mod)
+{
+        struct extent_buffer *leaf;
+        struct btrfs_key prev_key;
+        int ins_len = 0;
+        int cow = 0;
+        int ret;
+        if (mod < 0)
+                ins_len = -1;
+        if (mod != 0)
+                cow = 1;
+        ret = btrfs_search_slot(trans, root, location, path, ins_len, cow);
+        if (ret <= 0)
+                return ret;
+        if (btrfs_key_type(location) != BTRFS_ROOT_ITEM_KEY ||
+            location->offset != (u64)-1 || path->slots[0] == 0)
+                return ret;
+        leaf = path->nodes[0];
+        btrfs_item_key_to_cpu(leaf, &prev_key, path->slots[0] - 1);
+        if (prev_key.objectid != location->objectid ||
+            btrfs_key_type(&prev_key) != btrfs_key_type(location))
+                return ret;
+        path->slots[0]--;
+        return 0;
+}
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
new file mode 100644
index 000000000000..2aa79873eb46
--- /dev/null
+++ b/fs/btrfs/inode-map.c
@@ -0,0 +1,144 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+/*
+ * Look up the objectid of the item that sits just before the largest
+ * possible key (BTRFS_LAST_FREE_OBJECTID, -1, -1) in @root.
+ *
+ * The search is read-only (no transaction, no cow).  An exact match for
+ * that key is treated as a bug.  When the slot returned by the search has
+ * a predecessor in its leaf, the predecessor's objectid is stored in
+ * @objectid; otherwise @objectid is set to BTRFS_FIRST_FREE_OBJECTID.
+ *
+ * Returns 0 on success, -ENOMEM when no path can be allocated, or the
+ * negative error reported by btrfs_search_slot().
+ */
+int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid)
+{
+        struct btrfs_path *path;
+        int ret;
+        struct extent_buffer *l;
+        struct btrfs_key search_key;
+        struct btrfs_key found_key;
+        int slot;
+        path = btrfs_alloc_path();
+        /*
+         * callers already have to cope with negative returns from the
+         * search below, so report the allocation failure the same way
+         * instead of crashing
+         */
+        if (!path)
+                return -ENOMEM;
+        search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
+        search_key.type = -1;
+        search_key.offset = (u64)-1;
+        ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+        if (ret < 0)
+                goto error;
+        /* nothing may ever be stored at the maximal key itself */
+        BUG_ON(ret == 0);
+        if (path->slots[0] > 0) {
+                slot = path->slots[0] - 1;
+                l = path->nodes[0];
+                btrfs_item_key_to_cpu(l, &found_key, slot);
+                *objectid = found_key.objectid;
+        } else {
+                *objectid = BTRFS_FIRST_FREE_OBJECTID;
+        }
+        ret = 0;
+error:
+        btrfs_free_path(path);
+        return ret;
+}
+/*
+ * Hand out an unused objectid in @root, serialised by
+ * root->objectid_mutex.
+ *
+ * Fast path: when root->last_inode_alloc lies in
+ * [BTRFS_FIRST_FREE_OBJECTID, BTRFS_LAST_FREE_OBJECTID) it is simply
+ * incremented and returned.
+ *
+ * Slow path: walks the btree of allocated inodes and finds a hole.  The
+ * walk starts at max(@dirid, BTRFS_FIRST_FREE_OBJECTID) and returns the
+ * first objectid at or after that point which no item uses.
+ *
+ * Returns 0 with the result in @objectid, -ENOMEM when no path can be
+ * allocated, or the negative error from the tree search/iteration.
+ */
+int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
+                             struct btrfs_root *root,
+                             u64 dirid, u64 *objectid)
+{
+        struct btrfs_path *path;
+        struct btrfs_key key;
+        int ret;
+        int slot = 0;
+        u64 last_ino = 0;
+        int start_found;
+        struct extent_buffer *l;
+        struct btrfs_key search_key;
+        u64 search_start = dirid;
+        mutex_lock(&root->objectid_mutex);
+        if (root->last_inode_alloc >= BTRFS_FIRST_FREE_OBJECTID &&
+            root->last_inode_alloc < BTRFS_LAST_FREE_OBJECTID) {
+                *objectid = ++root->last_inode_alloc;
+                mutex_unlock(&root->objectid_mutex);
+                return 0;
+        }
+        path = btrfs_alloc_path();
+        if (!path) {
+                /* fail the allocation without leaving the mutex held */
+                mutex_unlock(&root->objectid_mutex);
+                return -ENOMEM;
+        }
+        search_start = max(search_start, BTRFS_FIRST_FREE_OBJECTID);
+        search_key.objectid = search_start;
+        search_key.type = 0;
+        search_key.offset = 0;
+        btrfs_init_path(path);
+        start_found = 0;
+        ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0);
+        if (ret < 0)
+                goto error;
+        while (1) {
+                l = path->nodes[0];
+                slot = path->slots[0];
+                if (slot >= btrfs_header_nritems(l)) {
+                        /* ran off this leaf, try the next one */
+                        ret = btrfs_next_leaf(root, path);
+                        if (ret == 0)
+                                continue;
+                        if (ret < 0)
+                                goto error;
+                        /* no more leaves: everything past here is free */
+                        if (!start_found) {
+                                *objectid = search_start;
+                                start_found = 1;
+                                goto found;
+                        }
+                        *objectid = last_ino > search_start ?
+                                last_ino : search_start;
+                        goto found;
+                }
+                btrfs_item_key_to_cpu(l, &key, slot);
+                if (key.objectid >= search_start) {
+                        if (start_found) {
+                                if (last_ino < search_start)
+                                        last_ino = search_start;
+                                /* gap between the previous item and this one */
+                                if (key.objectid > last_ino) {
+                                        *objectid = last_ino;
+                                        goto found;
+                                }
+                        } else if (key.objectid > search_start) {
+                                /* very first item is already past the start */
+                                *objectid = search_start;
+                                goto found;
+                        }
+                }
+                if (key.objectid >= BTRFS_LAST_FREE_OBJECTID)
+                        break;
+                start_found = 1;
+                last_ino = key.objectid + 1;
+                path->slots[0]++;
+        }
+        /* only reached when the walk hit BTRFS_LAST_FREE_OBJECTID */
+        BUG_ON(1);
+found:
+        btrfs_release_path(root, path);
+        btrfs_free_path(path);
+        BUG_ON(*objectid < search_start);
+        mutex_unlock(&root->objectid_mutex);
+        return 0;
+error:
+        btrfs_release_path(root, path);
+        btrfs_free_path(path);
+        mutex_unlock(&root->objectid_mutex);
+        return ret;
+}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
new file mode 100644
index 000000000000..8adfe059ab41
--- /dev/null
+++ b/fs/btrfs/inode.c
@@ -0,0 +1,5035 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#include <linux/kernel.h>
+#include <linux/bio.h>
+#include <linux/buffer_head.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/backing-dev.h>
+#include <linux/mpage.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/statfs.h>
+#include <linux/compat.h>
+#include <linux/bit_spinlock.h>
+#include <linux/version.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl.h>
+#include <linux/falloc.h>
+#include "compat.h"
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "ioctl.h"
+#include "print-tree.h"
+#include "volumes.h"
+#include "ordered-data.h"
+#include "xattr.h"
+#include "tree-log.h"
+#include "ref-cache.h"
+#include "compression.h"
+/*
+ * An inode number paired with a root.
+ * NOTE(review): presumably the lookup argument for the iget helpers of
+ * this file; the users are not visible in this part, confirm there.
+ */
+struct btrfs_iget_args {
+        u64 ino;
+        struct btrfs_root *root;
+};
+/*
+ * Operation tables are only declared here so that code earlier in the
+ * file can refer to them.
+ * NOTE(review): presumably they are initialised further down; the
+ * initialisers are not visible in this part, confirm there.
+ */
+static struct inode_operations btrfs_dir_inode_operations;
+static struct inode_operations btrfs_symlink_inode_operations;
+static struct inode_operations btrfs_dir_ro_inode_operations;
+static struct inode_operations btrfs_special_inode_operations;
+static struct inode_operations btrfs_file_inode_operations;
+static struct address_space_operations btrfs_aops;
+static struct address_space_operations btrfs_symlink_aops;
+static struct file_operations btrfs_dir_file_operations;
+static struct extent_io_ops btrfs_extent_io_ops;
+/*
+ * kmem_cache pointers.  Only btrfs_inode_cachep is static; the other
+ * four have external linkage.
+ */
+static struct kmem_cache *btrfs_inode_cachep;
+struct kmem_cache *btrfs_trans_handle_cachep;
+struct kmem_cache *btrfs_transaction_cachep;
+struct kmem_cache *btrfs_bit_radix_cachep;
+struct kmem_cache *btrfs_path_cachep;
+/*
+ * Table from the file-type bits of a mode to the matching BTRFS_FT_*
+ * constant.  Each entry is stored at (S_IFxxx >> S_SHIFT); slots without
+ * an initialiser stay zero.
+ * NOTE(review): lookups presumably index with (mode & S_IFMT) >> S_SHIFT;
+ * the users are not visible in this part of the file.
+ */
+#define S_SHIFT 12
+static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
+        [S_IFREG >> S_SHIFT]    = BTRFS_FT_REG_FILE,
+        [S_IFDIR >> S_SHIFT]    = BTRFS_FT_DIR,
+        [S_IFCHR >> S_SHIFT]    = BTRFS_FT_CHRDEV,
+        [S_IFBLK >> S_SHIFT]    = BTRFS_FT_BLKDEV,
+        [S_IFIFO >> S_SHIFT]    = BTRFS_FT_FIFO,
+        [S_IFSOCK >> S_SHIFT]   = BTRFS_FT_SOCK,
+        [S_IFLNK >> S_SHIFT]    = BTRFS_FT_SYMLINK,
+};
+/* forward declarations of static functions defined further down */
+static void btrfs_truncate(struct inode *inode);
+static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
+static noinline int cow_file_range(struct inode *inode,
+                                   struct page *locked_page,
+                                   u64 start, u64 end, int *page_started,
+                                   unsigned long *nr_written, int unlock);
+/*
+ * Crude guard against filling the filesystem: refuse new work once the
+ * used bytes, the outstanding delalloc bytes and @num_required together
+ * exceed 85% of the total bytes in the super block copy (90% when
+ * @for_del is set).  The author's own verdict: incorrect in countless
+ * ways, but better than nothing.
+ *
+ * Returns 0 when the request fits under the threshold, -ENOSPC otherwise.
+ */
+int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
+                           int for_del)
+{
+        struct btrfs_fs_info *info = root->fs_info;
+        u64 limit;
+        u64 in_use;
+        int err = 0;
+        spin_lock(&info->delalloc_lock);
+        limit = btrfs_super_total_bytes(&info->super_copy) *
+                (for_del ? 90 : 85);
+        do_div(limit, 100);
+        in_use = btrfs_super_bytes_used(&info->super_copy) +
+                 info->delalloc_bytes + num_required;
+        if (in_use > limit)
+                err = -ENOSPC;
+        spin_unlock(&info->delalloc_lock);
+        return err;
+}
+/*
+ * this does all the hard work for inserting an inline extent into
+ * the btree.  The caller should have done a btrfs_drop_extents so that
+ * no overlapping inline items exist in the btree
+ *
+ * @start is the file offset used as the key offset and @size the number
+ * of uncompressed bytes the extent stands for.  When both
+ * @compressed_size and @compressed_pages are set, @compressed_size bytes
+ * are copied out of those pages and the item is marked
+ * BTRFS_COMPRESS_ZLIB; otherwise @size bytes are copied from the page
+ * cache page that holds @start.
+ *
+ * Returns 0 on success, -ENOMEM when no path can be allocated, or the
+ * error from btrfs_insert_empty_item().  The result of the final
+ * btrfs_update_inode() call is not checked.
+ */
+static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
+                                struct btrfs_root *root, struct inode *inode,
+                                u64 start, size_t size, size_t compressed_size,
+                                struct page **compressed_pages)
+{
+        struct btrfs_key key;
+        struct btrfs_path *path;
+        struct extent_buffer *leaf;
+        struct page *page = NULL;
+        char *kaddr;
+        unsigned long ptr;
+        struct btrfs_file_extent_item *ei;
+        int ret;
+        size_t cur_size = size;
+        size_t datasize;
+        unsigned long offset;
+        int use_compress = 0;
+        if (compressed_size && compressed_pages) {
+                use_compress = 1;
+                cur_size = compressed_size;
+        }
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+        btrfs_set_trans_block_group(trans, inode);
+        key.objectid = inode->i_ino;
+        key.offset = start;
+        btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
+        /* the item holds the extent header plus the (compressed) payload */
+        datasize = btrfs_file_extent_calc_inline_size(cur_size);
+        ret = btrfs_insert_empty_item(trans, root, path, &key,
+                                      datasize);
+        /*
+         * hand the error back instead of BUG()ing in front of the error
+         * path, which used to make that path unreachable
+         */
+        if (ret)
+                goto fail;
+        /* charge the bytes to the inode only once the item exists */
+        inode_add_bytes(inode, size);
+        leaf = path->nodes[0];
+        ei = btrfs_item_ptr(leaf, path->slots[0],
+                            struct btrfs_file_extent_item);
+        btrfs_set_file_extent_generation(leaf, ei, trans->transid);
+        btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
+        btrfs_set_file_extent_encryption(leaf, ei, 0);
+        btrfs_set_file_extent_other_encoding(leaf, ei, 0);
+        btrfs_set_file_extent_ram_bytes(leaf, ei, size);
+        ptr = btrfs_file_extent_inline_start(ei);
+        if (use_compress) {
+                struct page *cpage;
+                int i = 0;
+                /* copy the compressed stream page by page into the leaf */
+                while (compressed_size > 0) {
+                        cpage = compressed_pages[i];
+                        cur_size = min_t(unsigned long, compressed_size,
+                                       PAGE_CACHE_SIZE);
+                        kaddr = kmap(cpage);
+                        write_extent_buffer(leaf, kaddr, ptr, cur_size);
+                        kunmap(cpage);
+                        i++;
+                        ptr += cur_size;
+                        compressed_size -= cur_size;
+                }
+                btrfs_set_file_extent_compression(leaf, ei,
+                                                  BTRFS_COMPRESS_ZLIB);
+        } else {
+                /*
+                 * NOTE(review): the lookup result is used unchecked, so
+                 * the page is assumed to be in the page cache -- confirm
+                 * that every caller guarantees this.
+                 */
+                page = find_get_page(inode->i_mapping,
+                                     start >> PAGE_CACHE_SHIFT);
+                btrfs_set_file_extent_compression(leaf, ei, 0);
+                kaddr = kmap_atomic(page, KM_USER0);
+                offset = start & (PAGE_CACHE_SIZE - 1);
+                write_extent_buffer(leaf, kaddr + offset, ptr, size);
+                kunmap_atomic(kaddr, KM_USER0);
+                page_cache_release(page);
+        }
+        btrfs_mark_buffer_dirty(leaf);
+        btrfs_free_path(path);
+        BTRFS_I(inode)->disk_i_size = inode->i_size;
+        btrfs_update_inode(trans, root, inode);
+        return 0;
+fail:
+        btrfs_free_path(path);
+        return ret;
+}
+/*
+ * Try to store the range [start, end] of @inode as an inline extent.
+ *
+ * The range is rejected (return 1, nothing changed) unless it starts at
+ * offset 0, ends below PAGE_CACHE_SIZE, reaches the end of the file, and
+ * its payload -- @compressed_size when non-zero, the plain length
+ * otherwise -- is below BTRFS_MAX_INLINE_DATA_SIZE() and no larger than
+ * the max_inline setting.  Uncompressed data whose end is sector aligned
+ * is rejected as well.
+ *
+ * Otherwise the old extents are dropped, the inline item is inserted and
+ * the cached extent maps for the range are thrown away; 0 is returned.
+ * Failures of the drop or the insert are fatal (BUG_ON).
+ */
+static int cow_file_range_inline(struct btrfs_trans_handle *trans,
+                                 struct btrfs_root *root,
+                                 struct inode *inode, u64 start, u64 end,
+                                 size_t compressed_size,
+                                 struct page **compressed_pages)
+{
+        u64 isize = i_size_read(inode);
+        u64 actual_end = (end + 1 < isize) ? end + 1 : isize;
+        u64 inline_len = actual_end - start;
+        u64 aligned_end;
+        u64 data_len;
+        u64 hint_byte;
+        int ret;
+        data_len = compressed_size ? compressed_size : inline_len;
+        /* every test below means "this cannot become an inline extent" */
+        if (start > 0)
+                return 1;
+        if (actual_end >= PAGE_CACHE_SIZE)
+                return 1;
+        if (data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root))
+                return 1;
+        if (!compressed_size &&
+            (actual_end & (root->sectorsize - 1)) == 0)
+                return 1;
+        if (end + 1 < isize)
+                return 1;
+        if (data_len > root->fs_info->max_inline)
+                return 1;
+        /* round the end up to a sector boundary for the drop calls */
+        aligned_end = (end + root->sectorsize - 1) &
+                        ~((u64)root->sectorsize - 1);
+        ret = btrfs_drop_extents(trans, root, inode, start,
+                                 aligned_end, start, &hint_byte);
+        BUG_ON(ret);
+        if (isize > actual_end)
+                inline_len = actual_end;
+        ret = insert_inline_extent(trans, root, inode, start,
+                                   inline_len, compressed_size,
+                                   compressed_pages);
+        BUG_ON(ret);
+        btrfs_drop_extent_cache(inode, start, aligned_end, 0);
+        return 0;
+}
+/*
+ * One range queued by compress_file_range() for
+ * submit_compressed_extents().  A NULL ->pages marks a range that is to
+ * be written uncompressed through cow_file_range().
+ */
+struct async_extent {
+        u64 start;              /* first file byte of the range */
+        u64 ram_size;           /* bytes of file data the range covers */
+        u64 compressed_size;    /* bytes to reserve on disk, 0 if uncompressed */
+        struct page **pages;    /* pages with the compressed data, or NULL */
+        unsigned long nr_pages; /* number of entries in ->pages */
+        struct list_head list;  /* link in async_cow->extents */
+};
+/*
+ * Collects the async_extent entries produced for one request.
+ * NOTE(review): the code that fills in inode, root, start, end and work
+ * is not visible in this part of the file; their descriptions below are
+ * inferred from the names -- confirm against the code that sets them up.
+ */
+struct async_cow {
+        struct inode *inode;            /* presumably the inode being written */
+        struct btrfs_root *root;        /* presumably the root of that inode */
+        struct page *locked_page;       /* handed on to cow_file_range() */
+        u64 start;                      /* presumably first byte of the request */
+        u64 end;                        /* presumably last byte of the request */
+        struct list_head extents;       /* list of struct async_extent */
+        struct btrfs_work work;         /* presumably the queued work item */
+};
+/*
+ * Append a new async_extent describing [start, start + ram_size) to
+ * @cow->extents.  @pages/@nr_pages carry the compressed data; a NULL
+ * @pages with @compressed_size 0 queues the range for an uncompressed
+ * write.
+ *
+ * Always returns 0.  The caller in this file ignores the return value and
+ * has pages locked that only the queued entry will release, so a failed
+ * allocation cannot be reported upwards and is made an explicit BUG.
+ */
+static noinline int add_async_extent(struct async_cow *cow,
+                                     u64 start, u64 ram_size,
+                                     u64 compressed_size,
+                                     struct page **pages,
+                                     unsigned long nr_pages)
+{
+        struct async_extent *async_extent;
+        async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
+        /* -ENOMEM: stop here rather than oops on the stores below */
+        BUG_ON(!async_extent);
+        async_extent->start = start;
+        async_extent->ram_size = ram_size;
+        async_extent->compressed_size = compressed_size;
+        async_extent->pages = pages;
+        async_extent->nr_pages = nr_pages;
+        list_add_tail(&async_extent->list, &cow->extents);
+        return 0;
+}
+/*
+ * we create compressed extents in two phases.  The first
+ * phase compresses a range of pages that have already been
+ * locked (both pages and state bits are locked).
+ *
+ * This is done inside an ordered work queue, and the compression
+ * is spread across many cpus.  The actual IO submission is step
+ * two, and the ordered work queue takes care of making sure that
+ * happens in the same order things were put onto the queue by
+ * writepages and friends.
+ *
+ * If this code finds it can't get good compression, it puts an
+ * entry onto the work queue to write the uncompressed bytes.  This
+ * makes sure that both compressed inodes and uncompressed inodes
+ * are written in the same order that pdflush sent them down.
+ *
+ * [start, end] is the inclusive byte range to process.  Every extent
+ * that gets queued on @async_cow bumps *@num_added by one.  @locked_page
+ * is redirtied when the range falls back to an uncompressed write and
+ * the page lies inside it.  If the array for the compressed pages cannot
+ * be allocated, compression is skipped and the range is queued
+ * uncompressed.
+ *
+ * Always returns 0.
+ */
+static noinline int compress_file_range(struct inode *inode,
+                                        struct page *locked_page,
+                                        u64 start, u64 end,
+                                        struct async_cow *async_cow,
+                                        int *num_added)
+{
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        struct btrfs_trans_handle *trans;
+        u64 num_bytes;
+        u64 orig_start;
+        u64 disk_num_bytes;
+        u64 blocksize = root->sectorsize;
+        u64 actual_end;
+        u64 isize = i_size_read(inode);
+        int ret = 0;
+        struct page **pages = NULL;
+        unsigned long nr_pages;
+        unsigned long nr_pages_ret = 0;
+        unsigned long total_compressed = 0;
+        unsigned long total_in = 0;
+        unsigned long max_compressed = 128 * 1024;
+        unsigned long max_uncompressed = 128 * 1024;
+        int i;
+        int will_compress;
+        orig_start = start;
+        actual_end = min_t(u64, isize, end + 1);
+again:
+        will_compress = 0;
+        nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
+        nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
+        total_compressed = actual_end - start;
+        /* we want to make sure that amount of ram required to uncompress
+         * an extent is reasonable, so we limit the total size in ram
+         * of a compressed extent to 128k.  This is a crucial number
+         * because it also controls how easily we can spread reads across
+         * cpus for decompression.
+         *
+         * We also want to make sure the amount of IO required to do
+         * a random read is reasonably small, so we limit the size of
+         * a compressed extent to 128k.
+         */
+        total_compressed = min(total_compressed, max_uncompressed);
+        num_bytes = (end - start + blocksize) & ~(blocksize - 1);
+        num_bytes = max(blocksize,  num_bytes);
+        disk_num_bytes = num_bytes;
+        total_in = 0;
+        ret = 0;
+        /*
+         * we do compression for mount -o compress and when the
+         * inode has not been flagged as nocompress.  This flag can
+         * change at any time if we discover bad compression ratios.
+         */
+        if (!btrfs_test_flag(inode, NOCOMPRESS) &&
+            btrfs_test_opt(root, COMPRESS)) {
+                WARN_ON(pages);
+                pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
+                /*
+                 * without the page array there is nothing to compress
+                 * into; a non-zero ret sends this range down the
+                 * uncompressed path below.  pages stays NULL, so the
+                 * inode is not flagged NOCOMPRESS for a transient
+                 * allocation failure.
+                 */
+                if (!pages)
+                        ret = -ENOMEM;
+                else
+                        ret = btrfs_zlib_compress_pages(inode->i_mapping,
+                                                        start,
+                                                        total_compressed,
+                                                        pages, nr_pages,
+                                                        &nr_pages_ret,
+                                                        &total_in,
+                                                        &total_compressed,
+                                                        max_compressed);
+                if (!ret) {
+                        unsigned long offset = total_compressed &
+                                (PAGE_CACHE_SIZE - 1);
+                        struct page *page = pages[nr_pages_ret - 1];
+                        char *kaddr;
+                        /* zero the tail end of the last page, we might be
+                         * sending it down to disk
+                         */
+                        if (offset) {
+                                kaddr = kmap_atomic(page, KM_USER0);
+                                memset(kaddr + offset, 0,
+                                       PAGE_CACHE_SIZE - offset);
+                                kunmap_atomic(kaddr, KM_USER0);
+                        }
+                        will_compress = 1;
+                }
+        }
+        if (start == 0) {
+                trans = btrfs_join_transaction(root, 1);
+                BUG_ON(!trans);
+                btrfs_set_trans_block_group(trans, inode);
+                /* lets try to make an inline extent */
+                if (ret || total_in < (actual_end - start)) {
+                        /* we didn't compress the entire range, try
+                         * to make an uncompressed inline extent.
+                         */
+                        ret = cow_file_range_inline(trans, root, inode,
+                                                    start, end, 0, NULL);
+                } else {
+                        /* try making a compressed inline extent */
+                        ret = cow_file_range_inline(trans, root, inode,
+                                                    start, end,
+                                                    total_compressed, pages);
+                }
+                btrfs_end_transaction(trans, root);
+                if (ret == 0) {
+                        /*
+                         * inline extent creation worked, we don't need
+                         * to create any more async work items.  Unlock
+                         * and free up our temp pages.
+                         */
+                        extent_clear_unlock_delalloc(inode,
+                                                     &BTRFS_I(inode)->io_tree,
+                                                     start, end, NULL, 1, 0,
+                                                     0, 1, 1, 1);
+                        ret = 0;
+                        goto free_pages_out;
+                }
+        }
+        if (will_compress) {
+                /*
+                 * we aren't doing an inline extent, so round the
+                 * compressed size up to a block size boundary so the
+                 * allocator does sane things
+                 */
+                total_compressed = (total_compressed + blocksize - 1) &
+                        ~(blocksize - 1);
+                /*
+                 * one last check to make sure the compression is really a
+                 * win, compare the page count read with the blocks on disk
+                 */
+                total_in = (total_in + PAGE_CACHE_SIZE - 1) &
+                        ~(PAGE_CACHE_SIZE - 1);
+                if (total_compressed >= total_in) {
+                        will_compress = 0;
+                } else {
+                        disk_num_bytes = total_compressed;
+                        num_bytes = total_in;
+                }
+        }
+        if (!will_compress && pages) {
+                /*
+                 * the compression code ran but failed to make things smaller,
+                 * free any pages it allocated and our page pointer array
+                 */
+                for (i = 0; i < nr_pages_ret; i++) {
+                        WARN_ON(pages[i]->mapping);
+                        page_cache_release(pages[i]);
+                }
+                kfree(pages);
+                pages = NULL;
+                total_compressed = 0;
+                nr_pages_ret = 0;
+                /* flag the file so we don't compress in the future */
+                btrfs_set_flag(inode, NOCOMPRESS);
+        }
+        if (will_compress) {
+                *num_added += 1;
+                /* the async work queues will take care of doing actual
+                 * allocation on disk for these compressed pages,
+                 * and will submit them to the elevator.
+                 */
+                add_async_extent(async_cow, start, num_bytes,
+                                 total_compressed, pages, nr_pages_ret);
+                /* more data left in the range: compress the next piece */
+                if (start + num_bytes < end && start + num_bytes < actual_end) {
+                        start += num_bytes;
+                        pages = NULL;
+                        cond_resched();
+                        goto again;
+                }
+        } else {
+                /*
+                 * No compression, but we still need to write the pages in
+                 * the file we've been given so far.  redirty the locked
+                 * page if it corresponds to our extent and set things up
+                 * for the async work queue to run cow_file_range to do
+                 * the normal delalloc dance
+                 */
+                if (page_offset(locked_page) >= start &&
+                    page_offset(locked_page) <= end) {
+                        __set_page_dirty_nobuffers(locked_page);
+                        /* unlocked later on in the async handlers */
+                }
+                add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0);
+                *num_added += 1;
+        }
+out:
+        return 0;
+free_pages_out:
+        for (i = 0; i < nr_pages_ret; i++) {
+                WARN_ON(pages[i]->mapping);
+                page_cache_release(pages[i]);
+        }
+        kfree(pages);
+        goto out;
+}
+/*
+ * phase two of compressed writeback.  This is the ordered portion
+ * of the code, which only gets called in the order the work was
+ * queued.  We walk all the async extents created by compress_file_range
+ * and send them down to the disk.
+ *
+ * Every entry is removed from @async_cow->extents and freed once it has
+ * been handled.  Entries without pages are written uncompressed through
+ * cow_file_range(); for the others disk space is reserved, an extent map
+ * and an ordered extent are set up, and the compressed pages are
+ * submitted.
+ *
+ * Always returns 0; failures of the reserve, ordered-extent and submit
+ * steps are fatal (BUG_ON).
+ */
+static noinline int submit_compressed_extents(struct inode *inode,
+                                              struct async_cow *async_cow)
+{
+        struct async_extent *async_extent;
+        u64 alloc_hint = 0;
+        struct btrfs_trans_handle *trans;
+        struct btrfs_key ins;
+        struct extent_map *em;
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+        struct extent_io_tree *io_tree;
+        int ret;
+        if (list_empty(&async_cow->extents))
+                return 0;
+        /* NOTE(review): the handle is used without a check, unlike in
+         * compress_file_range() which does BUG_ON(!trans)
+         */
+        trans = btrfs_join_transaction(root, 1);
+        while (!list_empty(&async_cow->extents)) {
+                /* take the oldest entry off the list */
+                async_extent = list_entry(async_cow->extents.next,
+                                          struct async_extent, list);
+                list_del(&async_extent->list);
+                io_tree = &BTRFS_I(inode)->io_tree;
+                /* did the compression code fall back to uncompressed IO? */
+                if (!async_extent->pages) {
+                        int page_started = 0;
+                        unsigned long nr_written = 0;
+                        lock_extent(io_tree, async_extent->start,
+                                    async_extent->start +
+                                    async_extent->ram_size - 1, GFP_NOFS);
+                        /* allocate blocks */
+                        cow_file_range(inode, async_cow->locked_page,
+                                       async_extent->start,
+                                       async_extent->start +
+                                       async_extent->ram_size - 1,
+                                       &page_started, &nr_written, 0);
+                        /*
+                         * if page_started, cow_file_range inserted an
+                         * inline extent and took care of all the unlocking
+                         * and IO for us.  Otherwise, we need to submit
+                         * all those pages down to the drive.
+                         */
+                        if (!page_started)
+                                extent_write_locked_range(io_tree,
+                                                  inode, async_extent->start,
+                                                  async_extent->start +
+                                                  async_extent->ram_size - 1,
+                                                  btrfs_get_extent,
+                                                  WB_SYNC_ALL);
+                        kfree(async_extent);
+                        cond_resched();
+                        continue;
+                }
+                lock_extent(io_tree, async_extent->start,
+                            async_extent->start + async_extent->ram_size - 1,
+                            GFP_NOFS);
+                /*
+                 * here we're doing allocation and writeback of the
+                 * compressed pages
+                 */
+                btrfs_drop_extent_cache(inode, async_extent->start,
+                                        async_extent->start +
+                                        async_extent->ram_size - 1, 0);
+                /* the reservation is sized by the compressed length and
+                 * hinted to follow the previous one
+                 */
+                ret = btrfs_reserve_extent(trans, root,
+                                           async_extent->compressed_size,
+                                           async_extent->compressed_size,
+                                           0, alloc_hint,
+                                           (u64)-1, &ins, 1);
+                BUG_ON(ret);
+                /* NOTE(review): the allocation result is dereferenced
+                 * without a NULL check
+                 */
+                em = alloc_extent_map(GFP_NOFS);
+                em->start = async_extent->start;
+                em->len = async_extent->ram_size;
+                em->orig_start = em->start;
+                em->block_start = ins.objectid;
+                em->block_len = ins.offset;
+                em->bdev = root->fs_info->fs_devices->latest_bdev;
+                set_bit(EXTENT_FLAG_PINNED, &em->flags);
+                set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+                /* on -EEXIST drop the cached mappings for the range and
+                 * try the insert again
+                 */
+                while (1) {
+                        spin_lock(&em_tree->lock);
+                        ret = add_extent_mapping(em_tree, em);
+                        spin_unlock(&em_tree->lock);
+                        if (ret != -EEXIST) {
+                                free_extent_map(em);
+                                break;
+                        }
+                        btrfs_drop_extent_cache(inode, async_extent->start,
+                                                async_extent->start +
+                                                async_extent->ram_size - 1, 0);
+                }
+                ret = btrfs_add_ordered_extent(inode, async_extent->start,
+                                               ins.objectid,
+                                               async_extent->ram_size,
+                                               ins.offset,
+                                               BTRFS_ORDERED_COMPRESSED);
+                BUG_ON(ret);
+                /* the transaction is ended before the write is submitted
+                 * and joined again afterwards
+                 */
+                btrfs_end_transaction(trans, root);
+                /*
+                 * clear dirty, set writeback and unlock the pages.
+                 */
+                extent_clear_unlock_delalloc(inode,
+                                             &BTRFS_I(inode)->io_tree,
+                                             async_extent->start,
+                                             async_extent->start +
+                                             async_extent->ram_size - 1,
+                                             NULL, 1, 1, 0, 1, 1, 0);
+                ret = btrfs_submit_compressed_write(inode,
+                                    async_extent->start,
+                                    async_extent->ram_size,
+                                    ins.objectid,
+                                    ins.offset, async_extent->pages,
+                                    async_extent->nr_pages);
+                BUG_ON(ret);
+                trans = btrfs_join_transaction(root, 1);
+                /* aim the next reservation right behind this one */
+                alloc_hint = ins.objectid + ins.offset;
+                kfree(async_extent);
+                cond_resched();
+        }
+        btrfs_end_transaction(trans, root);
+        return 0;
+}
+/*
+ * when extent_io.c finds a delayed allocation range in the file,
+ * the callbacks end up in this code.  The basic idea is to
+ * allocate extents on disk for the range, and create ordered data structs
+ * in ram to track those extents.
+ *
+ * locked_page is the page that writepage had locked already.  We use
+ * it to make sure we don't do extra locks or unlocks.
+ *
+ * *page_started is set to one if we unlock locked_page and do everything
+ * required to start IO on it.  It may be clean and already done with
+ * IO when we return.
+ *
+ * *nr_written is only advanced here when the range is stored as an
+ * inline extent.  'unlock' is handed to extent_clear_unlock_delalloc for
+ * every chunk allocated in the loop.
+ *
+ * Failures of the transaction join, the extent reservation, the extent map
+ * allocation and the ordered extent insertion are BUG_ON()s; the function
+ * always returns 0.
+ */
+static noinline int cow_file_range(struct inode *inode,
+                                   struct page *locked_page,
+                                   u64 start, u64 end, int *page_started,
+                                   unsigned long *nr_written,
+                                   int unlock)
+{
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        struct btrfs_trans_handle *trans;
+        u64 alloc_hint = 0;
+        u64 num_bytes;
+        unsigned long ram_size;
+        u64 disk_num_bytes;
+        u64 cur_alloc_size;
+        u64 blocksize = root->sectorsize;
+        struct btrfs_key ins;
+        struct extent_map *em;
+        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+        int ret = 0;
+        trans = btrfs_join_transaction(root, 1);
+        BUG_ON(!trans);
+        btrfs_set_trans_block_group(trans, inode);
+        /* round the byte count up to whole blocks, at least one block */
+        num_bytes = (end - start + blocksize) & ~(blocksize - 1);
+        num_bytes = max(blocksize,  num_bytes);
+        disk_num_bytes = num_bytes;
+        if (start == 0) {
+                /* lets try to make an inline extent */
+                ret = cow_file_range_inline(trans, root, inode,
+                                            start, end, 0, NULL);
+                if (ret == 0) {
+                        extent_clear_unlock_delalloc(inode,
+                                                     &BTRFS_I(inode)->io_tree,
+                                                     start, end, NULL, 1, 1,
+                                                     1, 1, 1, 1);
+                        *nr_written = *nr_written +
+                             (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
+                        *page_started = 1;
+                        goto out;
+                }
+        }
+        BUG_ON(disk_num_bytes >
+               btrfs_super_total_bytes(&root->fs_info->super_copy));
+        btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
+        while (disk_num_bytes > 0) {
+                cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);
+                ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
+                                           root->sectorsize, 0, alloc_hint,
+                                           (u64)-1, &ins, 1);
+                BUG_ON(ret);
+                em = alloc_extent_map(GFP_NOFS);
+                /* em is dereferenced right away, fail loudly on -ENOMEM */
+                BUG_ON(!em);
+                em->start = start;
+                em->orig_start = em->start;
+                ram_size = ins.offset;
+                em->len = ins.offset;
+                em->block_start = ins.objectid;
+                em->block_len = ins.offset;
+                em->bdev = root->fs_info->fs_devices->latest_bdev;
+                set_bit(EXTENT_FLAG_PINNED, &em->flags);
+                /*
+                 * retry the insert until it stops colliding with a cached
+                 * mapping, dropping the cached range between attempts
+                 */
+                while (1) {
+                        spin_lock(&em_tree->lock);
+                        ret = add_extent_mapping(em_tree, em);
+                        spin_unlock(&em_tree->lock);
+                        if (ret != -EEXIST) {
+                                free_extent_map(em);
+                                break;
+                        }
+                        btrfs_drop_extent_cache(inode, start,
+                                                start + ram_size - 1, 0);
+                }
+                cur_alloc_size = ins.offset;
+                ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
+                                               ram_size, cur_alloc_size, 0);
+                BUG_ON(ret);
+                if (root->root_key.objectid ==
+                    BTRFS_DATA_RELOC_TREE_OBJECTID) {
+                        ret = btrfs_reloc_clone_csums(inode, start,
+                                                      cur_alloc_size);
+                        BUG_ON(ret);
+                }
+                /*
+                 * the reservation came back larger than what is left;
+                 * stop before the unsigned counters below would wrap
+                 */
+                if (disk_num_bytes < cur_alloc_size)
+                        break;
+                /* we're not doing compressed IO, don't unlock the first
+                 * page (which the caller expects to stay locked), don't
+                 * clear any dirty bits and don't set any writeback bits
+                 */
+                extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
+                                             start, start + ram_size - 1,
+                                             locked_page, unlock, 1,
+                                             1, 0, 0, 0);
+                disk_num_bytes -= cur_alloc_size;
+                num_bytes -= cur_alloc_size;
+                alloc_hint = ins.objectid + ins.offset;
+                start += cur_alloc_size;
+        }
+out:
+        ret = 0;
+        btrfs_end_transaction(trans, root);
+        return ret;
+}
+/*
+ * work queue callback that starts compression on a range of a file
+ */
+static noinline void async_cow_start(struct btrfs_work *work)
+{
+        struct async_cow *cow = container_of(work, struct async_cow, work);
+        int nr_added = 0;
+        compress_file_range(cow->inode, cow->locked_page, cow->start,
+                            cow->end, cow, &nr_added);
+        /*
+         * nothing was added: clear the inode so async_cow_submit has
+         * nothing to submit for this work item
+         */
+        if (!nr_added)
+                cow->inode = NULL;
+}
+/*
+ * work queue callback to submit previously compressed pages.
+ *
+ * The pages covered by this work item are taken off async_delalloc_pages
+ * first, and anybody sleeping on async_submit_wait is woken once the
+ * counter is below the wakeup threshold.  The extents are only submitted
+ * if async_cow_start left the inode set.
+ */
+static noinline void async_cow_submit(struct btrfs_work *work)
+{
+        struct async_cow *async_cow;
+        struct btrfs_root *root;
+        unsigned long nr_pages;
+        async_cow = container_of(work, struct async_cow, work);
+        root = async_cow->root;
+        /* same page count that cow_file_range_async added for this item */
+        nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
+                PAGE_CACHE_SHIFT;
+        atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages);
+        if (atomic_read(&root->fs_info->async_delalloc_pages) <
+            5 * 1024 * 1024 &&
+            waitqueue_active(&root->fs_info->async_submit_wait))
+                wake_up(&root->fs_info->async_submit_wait);
+        if (async_cow->inode)
+                submit_compressed_extents(async_cow->inode, async_cow);
+}
+/*
+ * work queue callback that frees the async_cow wrapping the work item
+ */
+static noinline void async_cow_free(struct btrfs_work *work)
+{
+        kfree(container_of(work, struct async_cow, work));
+}
+/*
+ * hand a delalloc range over to the async delalloc workers.
+ *
+ * When the COMPRESS option is not set on the root this is a plain
+ * cow_file_range() call.  Otherwise the range is cut into pieces of at
+ * most 512K (or kept in one piece when the inode is flagged NOCOMPRESS)
+ * and every piece is queued on delalloc_workers as one async_cow.
+ *
+ * async_delalloc_pages counts the pages that were queued and not yet
+ * submitted; this function sleeps on async_submit_wait while that count
+ * is over the limit, or until it reaches zero while async_submit_draining
+ * is set.
+ *
+ * On the async path *nr_written is advanced by the number of pages queued,
+ * *page_started is set to 1 and 0 is returned.
+ */
+static int cow_file_range_async(struct inode *inode, struct page *locked_page,
+                                u64 start, u64 end, int *page_started,
+                                unsigned long *nr_written)
+{
+        struct async_cow *async_cow;
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        unsigned long nr_pages;
+        u64 cur_end;
+        int limit = 10 * 1024 * 1024;
+        if (!btrfs_test_opt(root, COMPRESS)) {
+                return cow_file_range(inode, locked_page, start, end,
+                                      page_started, nr_written, 1);
+        }
+        clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED |
+                         EXTENT_DELALLOC, 1, 0, GFP_NOFS);
+        while (start < end) {
+                async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
+                /*
+                 * EXTENT_LOCKED and EXTENT_DELALLOC were already cleared
+                 * above, so there is no simple way to back out here
+                 */
+                BUG_ON(!async_cow);
+                async_cow->inode = inode;
+                async_cow->root = root;
+                async_cow->locked_page = locked_page;
+                async_cow->start = start;
+                if (btrfs_test_flag(inode, NOCOMPRESS))
+                        cur_end = end;
+                else
+                        cur_end = min(end, start + 512 * 1024 - 1);
+                async_cow->end = cur_end;
+                INIT_LIST_HEAD(&async_cow->extents);
+                async_cow->work.func = async_cow_start;
+                async_cow->work.ordered_func = async_cow_submit;
+                async_cow->work.ordered_free = async_cow_free;
+                async_cow->work.flags = 0;
+                nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
+                        PAGE_CACHE_SHIFT;
+                atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
+                btrfs_queue_worker(&root->fs_info->delalloc_workers,
+                                   &async_cow->work);
+                /* throttle: too many pages are waiting on the workers */
+                if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
+                        wait_event(root->fs_info->async_submit_wait,
+                           (atomic_read(&root->fs_info->async_delalloc_pages) <
+                            limit));
+                }
+                /* somebody is draining: wait until nothing is pending */
+                while (atomic_read(&root->fs_info->async_submit_draining) &&
+                      atomic_read(&root->fs_info->async_delalloc_pages)) {
+                        wait_event(root->fs_info->async_submit_wait,
+                          (atomic_read(&root->fs_info->async_delalloc_pages) ==
+                           0));
+                }
+                *nr_written += nr_pages;
+                start = cur_end + 1;
+        }
+        *page_started = 1;
+        return 0;
+}
+/*
+ * returns 0 only when the csum lookup for [bytenr, bytenr + num_bytes)
+ * succeeded and found nothing.  Any sums that were found are freed here
+ * and 1 is returned; a failed lookup is reported as 1 as well.
+ */
+static noinline int csum_exist_in_range(struct btrfs_root *root,
+                                        u64 bytenr, u64 num_bytes)
+{
+        struct btrfs_ordered_sum *sum;
+        struct btrfs_ordered_sum *next;
+        LIST_HEAD(found);
+        int err;
+        err = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
+                                       bytenr + num_bytes - 1, &found);
+        if (list_empty(&found))
+                return err ? 1 : 0;
+        list_for_each_entry_safe(sum, next, &found, list) {
+                list_del(&sum->list);
+                kfree(sum);
+        }
+        return 1;
+}
+/*
+ * nocow writeback callback.  This checks for snapshots or COW copies
+ * of the extents that exist in the file, and COWs the file as required.
+ *
+ * If no cow copies or snapshots exist, we write directly to the existing
+ * blocks on disk
+ *
+ * With 'force' set, regular extents may be overwritten in place; without
+ * it only preallocated extents are, and regular extents are COWed.
+ *
+ * Ranges that have to be COWed are collected in cow_start and handed to
+ * cow_file_range().  All failures are BUG_ON()s; returns 0.
+ */
+static int run_delalloc_nocow(struct inode *inode, struct page *locked_page,
+                              u64 start, u64 end, int *page_started, int force,
+                              unsigned long *nr_written)
+{
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        struct btrfs_trans_handle *trans;
+        struct extent_buffer *leaf;
+        struct btrfs_path *path;
+        struct btrfs_file_extent_item *fi;
+        struct btrfs_key found_key;
+        u64 cow_start;
+        u64 cur_offset;
+        u64 extent_end;
+        u64 disk_bytenr;
+        u64 num_bytes;
+        int extent_type;
+        int ret;
+        int type;
+        int nocow;
+        int check_prev = 1;
+        path = btrfs_alloc_path();
+        BUG_ON(!path);
+        trans = btrfs_join_transaction(root, 1);
+        BUG_ON(!trans);
+        /* (u64)-1 means "no COW range is pending" */
+        cow_start = (u64)-1;
+        cur_offset = start;
+        while (1) {
+                ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
+                                               cur_offset, 0);
+                BUG_ON(ret < 0);
+                /*
+                 * no exact match on the first lookup: step back one slot
+                 * if the previous item is an extent of this inode
+                 */
+                if (ret > 0 && path->slots[0] > 0 && check_prev) {
+                        leaf = path->nodes[0];
+                        btrfs_item_key_to_cpu(leaf, &found_key,
+                                              path->slots[0] - 1);
+                        if (found_key.objectid == inode->i_ino &&
+                            found_key.type == BTRFS_EXTENT_DATA_KEY)
+                                path->slots[0]--;
+                }
+                check_prev = 0;
+next_slot:
+                leaf = path->nodes[0];
+                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+                        ret = btrfs_next_leaf(root, path);
+                        if (ret < 0)
+                                BUG_ON(1);
+                        if (ret > 0)
+                                break;
+                        leaf = path->nodes[0];
+                }
+                nocow = 0;
+                disk_bytenr = 0;
+                num_bytes = 0;
+                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+                /* walked past this inode's extents or past the range */
+                if (found_key.objectid > inode->i_ino ||
+                    found_key.type > BTRFS_EXTENT_DATA_KEY ||
+                    found_key.offset > end)
+                        break;
+                /* gap before the next extent item: it has to be COWed */
+                if (found_key.offset > cur_offset) {
+                        extent_end = found_key.offset;
+                        goto out_check;
+                }
+                fi = btrfs_item_ptr(leaf, path->slots[0],
+                                    struct btrfs_file_extent_item);
+                extent_type = btrfs_file_extent_type(leaf, fi);
+                if (extent_type == BTRFS_FILE_EXTENT_REG ||
+                    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
+                        disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+                        extent_end = found_key.offset +
+                                btrfs_file_extent_num_bytes(leaf, fi);
+                        if (extent_end <= start) {
+                                path->slots[0]++;
+                                goto next_slot;
+                        }
+                        /* every test below that fails sends the range to COW */
+                        if (disk_bytenr == 0)
+                                goto out_check;
+                        if (btrfs_file_extent_compression(leaf, fi) ||
+                            btrfs_file_extent_encryption(leaf, fi) ||
+                            btrfs_file_extent_other_encoding(leaf, fi))
+                                goto out_check;
+                        if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
+                                goto out_check;
+                        if (btrfs_extent_readonly(root, disk_bytenr))
+                                goto out_check;
+                        if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
+                                                  disk_bytenr))
+                                goto out_check;
+                        disk_bytenr += btrfs_file_extent_offset(leaf, fi);
+                        disk_bytenr += cur_offset - found_key.offset;
+                        num_bytes = min(end + 1, extent_end) - cur_offset;
+                        /*
+                         * force cow if csum exists in the range.
+                         * this ensures that csums for a given extent are
+                         * either valid or do not exist.
+                         */
+                        if (csum_exist_in_range(root, disk_bytenr, num_bytes))
+                                goto out_check;
+                        nocow = 1;
+                } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+                        extent_end = found_key.offset +
+                                btrfs_file_extent_inline_len(leaf, fi);
+                        extent_end = ALIGN(extent_end, root->sectorsize);
+                } else {
+                        BUG_ON(1);
+                }
+out_check:
+                if (extent_end <= start) {
+                        path->slots[0]++;
+                        goto next_slot;
+                }
+                if (!nocow) {
+                        /* extend (or open) the pending COW range */
+                        if (cow_start == (u64)-1)
+                                cow_start = cur_offset;
+                        cur_offset = extent_end;
+                        if (cur_offset > end)
+                                break;
+                        path->slots[0]++;
+                        goto next_slot;
+                }
+                btrfs_release_path(root, path);
+                /* flush the COW range that precedes this nocow extent */
+                if (cow_start != (u64)-1) {
+                        ret = cow_file_range(inode, locked_page, cow_start,
+                                        found_key.offset - 1, page_started,
+                                        nr_written, 1);
+                        BUG_ON(ret);
+                        cow_start = (u64)-1;
+                }
+                if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
+                        struct extent_map *em;
+                        struct extent_map_tree *em_tree;
+                        em_tree = &BTRFS_I(inode)->extent_tree;
+                        em = alloc_extent_map(GFP_NOFS);
+                        /* em is dereferenced right away, fail loudly on -ENOMEM */
+                        BUG_ON(!em);
+                        em->start = cur_offset;
+                        em->orig_start = em->start;
+                        em->len = num_bytes;
+                        em->block_len = num_bytes;
+                        em->block_start = disk_bytenr;
+                        em->bdev = root->fs_info->fs_devices->latest_bdev;
+                        set_bit(EXTENT_FLAG_PINNED, &em->flags);
+                        /*
+                         * retry the insert until it stops colliding with a
+                         * cached mapping, dropping the range in between
+                         */
+                        while (1) {
+                                spin_lock(&em_tree->lock);
+                                ret = add_extent_mapping(em_tree, em);
+                                spin_unlock(&em_tree->lock);
+                                if (ret != -EEXIST) {
+                                        free_extent_map(em);
+                                        break;
+                                }
+                                btrfs_drop_extent_cache(inode, em->start,
+                                                em->start + em->len - 1, 0);
+                        }
+                        type = BTRFS_ORDERED_PREALLOC;
+                } else {
+                        type = BTRFS_ORDERED_NOCOW;
+                }
+                ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
+                                               num_bytes, num_bytes, type);
+                BUG_ON(ret);
+                extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
+                                        cur_offset, cur_offset + num_bytes - 1,
+                                        locked_page, 1, 1, 1, 0, 0, 0);
+                cur_offset = extent_end;
+                if (cur_offset > end)
+                        break;
+        }
+        btrfs_release_path(root, path);
+        /* whatever is left of the range and not yet covered gets COWed */
+        if (cur_offset <= end && cow_start == (u64)-1)
+                cow_start = cur_offset;
+        if (cow_start != (u64)-1) {
+                ret = cow_file_range(inode, locked_page, cow_start, end,
+                                     page_started, nr_written, 1);
+                BUG_ON(ret);
+        }
+        ret = btrfs_end_transaction(trans, root);
+        BUG_ON(ret);
+        btrfs_free_path(path);
+        return 0;
+}
+/*
+ * extent_io.c callback for delayed allocation processing.  Picks the
+ * writeback path from the inode flags: NODATACOW inodes take the nocow
+ * path with force set, PREALLOC inodes take it without force, and
+ * everything else goes through cow_file_range_async().
+ */
+static int run_delalloc_range(struct inode *inode, struct page *locked_page,
+                              u64 start, u64 end, int *page_started,
+                              unsigned long *nr_written)
+{
+        if (btrfs_test_flag(inode, NODATACOW))
+                return run_delalloc_nocow(inode, locked_page, start, end,
+                                          page_started, 1, nr_written);
+        if (btrfs_test_flag(inode, PREALLOC))
+                return run_delalloc_nocow(inode, locked_page, start, end,
+                                          page_started, 0, nr_written);
+        return cow_file_range_async(inode, locked_page, start, end,
+                                    page_started, nr_written);
+}
+/*
+ * extent_io.c set_bit_hook, used to track delayed allocation
+ * bytes in this file, and to maintain the list of inodes that
+ * have pending delalloc work to be done.
+ */
+static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
+                       unsigned long old, unsigned long bits)
+{
+        struct btrfs_root *root;
+        u64 len;
+        /*
+         * set_bit and clear bit hooks normally require _irqsave/restore
+         * but in this case, we are only testing for the DELALLOC
+         * bit, which is only set or cleared with irqs on
+         */
+        /* only a range that gains DELALLOC is accounted */
+        if ((old & EXTENT_DELALLOC) || !(bits & EXTENT_DELALLOC))
+                return 0;
+        root = BTRFS_I(inode)->root;
+        len = end - start + 1;
+        spin_lock(&root->fs_info->delalloc_lock);
+        BTRFS_I(inode)->delalloc_bytes += len;
+        root->fs_info->delalloc_bytes += len;
+        /* first delalloc bytes for this inode: put it on the fs wide list */
+        if (list_empty(&BTRFS_I(inode)->delalloc_inodes))
+                list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
+                              &root->fs_info->delalloc_inodes);
+        spin_unlock(&root->fs_info->delalloc_lock);
+        return 0;
+}
+/*
+ * extent_io.c clear_bit_hook, the counterpart of btrfs_set_bit_hook: takes
+ * the range back off the delalloc byte counters and drops the inode from
+ * the delalloc list once it has no delalloc bytes left.
+ */
+static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
+                         unsigned long old, unsigned long bits)
+{
+        struct btrfs_root *root;
+        u64 len;
+        /*
+         * set_bit and clear bit hooks normally require _irqsave/restore
+         * but in this case, we are only testing for the DELALLOC
+         * bit, which is only set or cleared with irqs on
+         */
+        /* only a range that loses DELALLOC is accounted */
+        if (!(old & EXTENT_DELALLOC) || !(bits & EXTENT_DELALLOC))
+                return 0;
+        root = BTRFS_I(inode)->root;
+        len = end - start + 1;
+        spin_lock(&root->fs_info->delalloc_lock);
+        if (len > root->fs_info->delalloc_bytes) {
+                /* accounting went wrong: complain and reset both counters */
+                printk(KERN_INFO "btrfs warning: delalloc account "
+                       "%llu %llu\n",
+                       (unsigned long long)len,
+                       (unsigned long long)
+                       root->fs_info->delalloc_bytes);
+                root->fs_info->delalloc_bytes = 0;
+                BTRFS_I(inode)->delalloc_bytes = 0;
+        } else {
+                root->fs_info->delalloc_bytes -= len;
+                BTRFS_I(inode)->delalloc_bytes -= len;
+        }
+        if (BTRFS_I(inode)->delalloc_bytes == 0 &&
+            !list_empty(&BTRFS_I(inode)->delalloc_inodes))
+                list_del_init(&BTRFS_I(inode)->delalloc_inodes);
+        spin_unlock(&root->fs_info->delalloc_lock);
+        return 0;
+}
+/*
+ * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
+ * we don't create bios that span stripes or chunks
+ *
+ * returns 1 when adding 'size' more bytes to the bio would run past the
+ * length mapped at the bio's start, 0 when the bytes fit.  Compressed bios
+ * are never refused here.
+ */
+int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
+                         size_t size, struct bio *bio,
+                         unsigned long bio_flags)
+{
+        struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
+        u64 logical = (u64)bio->bi_sector << 9;
+        u64 length;
+        u64 map_length;
+        int ret;
+        if (bio_flags & EXTENT_BIO_COMPRESSED)
+                return 0;
+        length = bio->bi_size;
+        map_length = length;
+        /*
+         * NOTE(review): the return value is not examined.  If the call
+         * leaves map_length untouched on failure, the test below refuses
+         * the merge for any non-zero size - confirm against
+         * btrfs_map_block.
+         */
+        ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, logical,
+                              &map_length, NULL, 0);
+        return map_length < length + size ? 1 : 0;
+}
+/*
+ * in order to insert checksums into the metadata in large chunks,
+ * we wait until bio submission time.   All the pages in the bio are
+ * checksummed and sums are attached onto the ordered extent record.
+ *
+ * At IO completion time the csums attached on the ordered extent record
+ * are inserted into the btree
+ */
+static int __btrfs_submit_bio_start(struct inode *inode, int rw,
+                                    struct bio *bio, int mirror_num,
+                                    unsigned long bio_flags)
+{
+        int ret = btrfs_csum_one_bio(BTRFS_I(inode)->root, inode, bio, 0, 0);
+        BUG_ON(ret);
+        return 0;
+}
+/*
+ * second stage of the async write submission set up in
+ * btrfs_submit_bio_hook: __btrfs_submit_bio_start has checksummed the
+ * bio, all that is left is to map it and send it down.
+ */
+static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
+                          int mirror_num, unsigned long bio_flags)
+{
+        return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
+}
+/*
+ * extent_io.c submission hook. This does the right thing for csum calculation
+ * on write, or reading the csums from the tree before a read
+ *
+ * reads:  compressed bios go to btrfs_submit_compressed_read(); other reads
+ *         look up their csums first unless the inode is NODATASUM.
+ * writes: checksummed asynchronously through btrfs_wq_submit_bio(), except
+ *         for NODATASUM inodes and the data reloc tree, which are mapped
+ *         directly.
+ */
+static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
+                          int mirror_num, unsigned long bio_flags)
+{
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        int skip_sum = btrfs_test_flag(inode, NODATASUM);
+        int is_write = rw & (1 << BIO_RW);
+        int ret;
+        ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+        BUG_ON(ret);
+        if (!is_write) {
+                if (bio_flags & EXTENT_BIO_COMPRESSED)
+                        return btrfs_submit_compressed_read(inode, bio,
+                                                    mirror_num, bio_flags);
+                if (!skip_sum)
+                        btrfs_lookup_bio_sums(root, inode, bio, NULL);
+        } else if (!skip_sum &&
+                   root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
+                /*
+                 * we're doing a write, do the async checksumming.  The
+                 * data reloc tree is left out because its csum items
+                 * have already been cloned
+                 */
+                return btrfs_wq_submit_bio(root->fs_info,
+                                   inode, rw, bio, mirror_num,
+                                   bio_flags, __btrfs_submit_bio_start,
+                                   __btrfs_submit_bio_done);
+        }
+        return btrfs_map_bio(root, rw, bio, mirror_num, 0);
+}
+/*
+ * given a list of ordered sums record them in the inode.  This happens
+ * at IO completion time based on sums calculated at bio submission time.
+ *
+ * file_offset is not used and the result of each insert is not checked;
+ * always returns 0.
+ */
+static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
+                             struct inode *inode, u64 file_offset,
+                             struct list_head *list)
+{
+        struct btrfs_ordered_sum *sum;
+        btrfs_set_trans_block_group(trans, inode);
+        list_for_each_entry(sum, list, list) {
+                struct btrfs_root *csum_root;
+                csum_root = BTRFS_I(inode)->root->fs_info->csum_root;
+                btrfs_csum_file_blocks(trans, csum_root, sum);
+        }
+        return 0;
+}
+/*
+ * set the delalloc bits on [start, end] in the inode's io_tree.  A page
+ * aligned 'end' triggers a warning: the callers in this file pass the
+ * offset of the last byte of a page, so an aligned value looks like an
+ * off by one.
+ */
+int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end)
+{
+        struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+        WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0);
+        return set_extent_delalloc(tree, start, end, GFP_NOFS);
+}
+/*
+ * see btrfs_writepage_start_hook for details on why this is required
+ *
+ * One of these is allocated per page that needs fixing up and is handed
+ * to the fixup worker through the embedded work item.
+ */
+struct btrfs_writepage_fixup {
+        struct page *page; /* page to fix up, holds a page cache reference */
+        struct btrfs_work work; /* queued on fs_info->fixup_workers */
+};
+/*
+ * worker side of btrfs_writepage_start_hook: make a page that was dirtied
+ * behind our back safe to write by setting delalloc on its range.
+ *
+ * The page reference taken by the hook is dropped here, and the fixup
+ * itself is freed here as well: the hook sets no free callback on the
+ * work item, so nothing else would release it.
+ */
+static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
+{
+        struct btrfs_writepage_fixup *fixup;
+        struct btrfs_ordered_extent *ordered;
+        struct page *page;
+        struct inode *inode;
+        u64 page_start;
+        u64 page_end;
+        fixup = container_of(work, struct btrfs_writepage_fixup, work);
+        page = fixup->page;
+again:
+        lock_page(page);
+        /* page went away, got cleaned or was already handled */
+        if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
+                ClearPageChecked(page);
+                goto out_page;
+        }
+        inode = page->mapping->host;
+        page_start = page_offset(page);
+        page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
+        lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
+        /* already ordered? We're done */
+        if (test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
+                             EXTENT_ORDERED, 0)) {
+                goto out;
+        }
+        ordered = btrfs_lookup_ordered_extent(inode, page_start);
+        if (ordered) {
+                /*
+                 * an ordered extent covers the page: drop our locks, wait
+                 * for it and start over.
+                 * NOTE(review): if btrfs_lookup_ordered_extent returns a
+                 * referenced extent it is not put here - confirm.
+                 */
+                unlock_extent(&BTRFS_I(inode)->io_tree, page_start,
+                              page_end, GFP_NOFS);
+                unlock_page(page);
+                btrfs_start_ordered_extent(inode, ordered, 1);
+                goto again;
+        }
+        btrfs_set_extent_delalloc(inode, page_start, page_end);
+        ClearPageChecked(page);
+out:
+        unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
+out_page:
+        unlock_page(page);
+        page_cache_release(page);
+        /*
+         * NOTE(review): assumes the worker thread does not touch the work
+         * item after ->func returns on this (non ordered) queue - confirm
+         * against async-thread.c
+         */
+        kfree(fixup);
+}
+/*
+ * There are a few paths in the higher layers of the kernel that directly
+ * set the page dirty bit without asking the filesystem if it is a
+ * good idea.  This causes problems because we want to make sure COW
+ * properly happens and the data=ordered rules are followed.
+ *
+ * In our case any range that doesn't have the ORDERED bit set
+ * hasn't been properly setup for IO.  We kick off an async process
+ * to fix it up.  The async helper will wait for ordered extents, set
+ * the delalloc bit and make it safe to write the page.
+ *
+ * returns 0 when the range is already ordered, -EAGAIN in every other
+ * case (fixup queued, fixup already pending, or allocation failure).
+ */
+static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
+{
+        struct inode *inode = page->mapping->host;
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        struct btrfs_writepage_fixup *fixup;
+        if (test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
+                           EXTENT_ORDERED, 0))
+                return 0;
+        /* PageChecked marks a page that already has a fixup queued */
+        if (PageChecked(page))
+                return -EAGAIN;
+        fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
+        if (fixup) {
+                SetPageChecked(page);
+                /* reference for the worker, dropped when it is done */
+                page_cache_get(page);
+                fixup->page = page;
+                fixup->work.func = btrfs_writepage_fixup_worker;
+                btrfs_queue_worker(&root->fs_info->fixup_workers,
+                                   &fixup->work);
+        }
+        return -EAGAIN;
+}
+/*
+ * insert the file extent item that points file_pos at an extent on disk,
+ * then record the extent itself with btrfs_alloc_reserved_extent().
+ *
+ * Whatever the file had in [file_pos, file_pos + num_bytes) is dropped
+ * first, num_bytes is added to the inode's byte count and the cached
+ * extent mappings for the range are thrown away.
+ *
+ * All failures are BUG_ON()s; returns 0.
+ */
+static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
+                                       struct inode *inode, u64 file_pos,
+                                       u64 disk_bytenr, u64 disk_num_bytes,
+                                       u64 num_bytes, u64 ram_bytes,
+                                       u8 compression, u8 encryption,
+                                       u16 other_encoding, int extent_type)
+{
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        struct btrfs_path *path = btrfs_alloc_path();
+        struct btrfs_file_extent_item *item;
+        struct extent_buffer *eb;
+        struct btrfs_key key;
+        u64 hint;
+        int err;
+        BUG_ON(!path);
+        err = btrfs_drop_extents(trans, root, inode, file_pos,
+                                 file_pos + num_bytes, file_pos, &hint);
+        BUG_ON(err);
+        /* key of the new file extent item */
+        key.objectid = inode->i_ino;
+        key.type = BTRFS_EXTENT_DATA_KEY;
+        key.offset = file_pos;
+        err = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*item));
+        BUG_ON(err);
+        eb = path->nodes[0];
+        item = btrfs_item_ptr(eb, path->slots[0],
+                              struct btrfs_file_extent_item);
+        btrfs_set_file_extent_generation(eb, item, trans->transid);
+        btrfs_set_file_extent_type(eb, item, extent_type);
+        btrfs_set_file_extent_disk_bytenr(eb, item, disk_bytenr);
+        btrfs_set_file_extent_disk_num_bytes(eb, item, disk_num_bytes);
+        btrfs_set_file_extent_offset(eb, item, 0);
+        btrfs_set_file_extent_num_bytes(eb, item, num_bytes);
+        btrfs_set_file_extent_ram_bytes(eb, item, ram_bytes);
+        btrfs_set_file_extent_compression(eb, item, compression);
+        btrfs_set_file_extent_encryption(eb, item, encryption);
+        btrfs_set_file_extent_other_encoding(eb, item, other_encoding);
+        btrfs_mark_buffer_dirty(eb);
+        inode_add_bytes(inode, num_bytes);
+        btrfs_drop_extent_cache(inode, file_pos, file_pos + num_bytes - 1, 0);
+        /* the same key variable now describes the extent on disk */
+        key.objectid = disk_bytenr;
+        key.type = BTRFS_EXTENT_ITEM_KEY;
+        key.offset = disk_num_bytes;
+        err = btrfs_alloc_reserved_extent(trans, root, eb->start,
+                                          root->root_key.objectid,
+                                          trans->transid, inode->i_ino, &key);
+        BUG_ON(err);
+        btrfs_free_path(path);
+        return 0;
+}
+/* as ordered data IO finishes, this gets called so we can finish
+ * an ordered extent if the range of bytes in the file it covers are
+ * fully written.
+ *
+ * start/end are the inclusive byte range whose IO just completed.  When
+ * btrfs_dec_test_ordered_pending() returns 0 nothing more is done here.
+ * Otherwise the ordered extent starting at 'start' is looked up (it must
+ * exist) and, inside a joined transaction:
+ *  - NOCOW extents skip straight to the csum/inode update step;
+ *  - PREALLOC extents are marked written (they must not be compressed);
+ *  - everything else gets a BTRFS_FILE_EXTENT_REG item through
+ *    insert_reserved_file_extent(), done with the extent range locked.
+ * Then the pending csums are added and, under extent_mutex, i_size and
+ * the inode item are updated and the ordered extent is removed.
+ *
+ * Always returns 0; the failures that are checked are BUG_ON()s.
+ * NOTE(review): the results of btrfs_join_transaction() and
+ * btrfs_update_inode() are not checked here.
+ */
+static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
+{
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        struct btrfs_trans_handle *trans;
+        struct btrfs_ordered_extent *ordered_extent;
+        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+        int compressed = 0;
+        int ret;
+        ret = btrfs_dec_test_ordered_pending(inode, start, end - start + 1);
+        if (!ret)
+                return 0;
+        trans = btrfs_join_transaction(root, 1);
+        ordered_extent = btrfs_lookup_ordered_extent(inode, start);
+        BUG_ON(!ordered_extent);
+        if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags))
+                goto nocow;
+        lock_extent(io_tree, ordered_extent->file_offset,
+                    ordered_extent->file_offset + ordered_extent->len - 1,
+                    GFP_NOFS);
+        if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
+                compressed = 1;
+        if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
+                BUG_ON(compressed);
+                ret = btrfs_mark_extent_written(trans, root, inode,
+                                                ordered_extent->file_offset,
+                                                ordered_extent->file_offset +
+                                                ordered_extent->len);
+                BUG_ON(ret);
+        } else {
+                ret = insert_reserved_file_extent(trans, inode,
+                                                ordered_extent->file_offset,
+                                                ordered_extent->start,
+                                                ordered_extent->disk_len,
+                                                ordered_extent->len,
+                                                ordered_extent->len,
+                                                compressed, 0, 0,
+                                                BTRFS_FILE_EXTENT_REG);
+                BUG_ON(ret);
+        }
+        unlock_extent(io_tree, ordered_extent->file_offset,
+                    ordered_extent->file_offset + ordered_extent->len - 1,
+                    GFP_NOFS);
+nocow:
+        add_pending_csums(trans, inode, ordered_extent->file_offset,
+                          &ordered_extent->list);
+        mutex_lock(&BTRFS_I(inode)->extent_mutex);
+        btrfs_ordered_update_i_size(inode, ordered_extent);
+        btrfs_update_inode(trans, root, inode);
+        btrfs_remove_ordered_extent(inode, ordered_extent);
+        mutex_unlock(&BTRFS_I(inode)->extent_mutex);
+        /* once for us */
+        btrfs_put_ordered_extent(ordered_extent);
+        /* once for the tree */
+        btrfs_put_ordered_extent(ordered_extent);
+        btrfs_end_transaction(trans, root);
+        return 0;
+}
+/*
+ * Write end_io hook: forwards the finished byte range of the page's inode
+ * to btrfs_finish_ordered_io().  'state' and 'uptodate' are not consulted.
+ */
+static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
+                                struct extent_state *state, int uptodate)
+{
+        struct inode *inode = page->mapping->host;
+        return btrfs_finish_ordered_io(inode, start, end);
+}
+/*
+ * When IO fails, either with EIO or because csum verification fails, we
+ * try other mirrors that might have a good copy of the data.  This
+ * io_failure_record is used to record state as we go through all the
+ * mirrors.  If another mirror has good data, the page is set up to date
+ * and things continue.  If a good mirror can't be found, the original
+ * bio end_io callback is called to indicate things have failed.
+ *
+ * A record is stored as the 'private' value of the failed range in the
+ * inode's io_failure_tree by btrfs_io_failed_hook().  It is freed either
+ * by btrfs_clean_io_failures() or by btrfs_io_failed_hook() itself once
+ * last_mirror exceeds the number of copies.
+ * NOTE(review): btrfs_io_failed_hook() never assigns 'page' - confirm
+ * whether the field is still needed.
+ */
+struct io_failure_record {
+        struct page *page;
+        u64 start;                      /* first byte of the failed range */
+        u64 len;                        /* end - start + 1 */
+        u64 logical;                    /* address used for the retry bio */
+        unsigned long bio_flags;        /* 0 or EXTENT_BIO_COMPRESSED */
+        int last_mirror;                /* mirror number of the latest retry */
+};
+/*
+ * Called when a bio fails (IO error or csum mismatch) to retry the range
+ * [start, end] of 'page' from the next mirror.
+ *
+ * The first failure of a range allocates an io_failure_record, resolves
+ * the address to read from through the inode's extent map tree and stores
+ * the record as the private value of the range in the io_failure_tree.
+ * Later failures of the same range find and reuse that record.  Every call
+ * bumps last_mirror and resubmits a single-page bio through
+ * submit_bio_hook with that mirror number.
+ *
+ * Returns 0 when a retry bio was submitted, -ENOMEM when the record could
+ * not be allocated, and -EIO when no extent mapping covers 'start', when
+ * no locked extent state starting at the failed range can be found, or
+ * when last_mirror has gone past the number of copies.
+ */
+static int btrfs_io_failed_hook(struct bio *failed_bio,
+                         struct page *page, u64 start, u64 end,
+                         struct extent_state *state)
+{
+        struct io_failure_record *failrec = NULL;
+        u64 private;
+        struct extent_map *em;
+        struct inode *inode = page->mapping->host;
+        struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
+        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+        struct bio *bio;
+        int num_copies;
+        int ret;
+        int rw;
+        u64 logical;
+        ret = get_state_private(failure_tree, start, &private);
+        if (ret) {
+                /* first failure for this range: build a new record */
+                failrec = kmalloc(sizeof(*failrec), GFP_NOFS);
+                if (!failrec)
+                        return -ENOMEM;
+                failrec->start = start;
+                failrec->len = end - start + 1;
+                failrec->last_mirror = 0;
+                failrec->bio_flags = 0;
+                spin_lock(&em_tree->lock);
+                em = lookup_extent_mapping(em_tree, start, failrec->len);
+                /*
+                 * the lookup can come back empty (or as an error pointer),
+                 * so only look inside the mapping when we really got one;
+                 * the check after the unlock turns both cases into -EIO
+                 */
+                if (em && !IS_ERR(em) &&
+                    (em->start > start || em->start + em->len < start)) {
+                        free_extent_map(em);
+                        em = NULL;
+                }
+                spin_unlock(&em_tree->lock);
+                if (!em || IS_ERR(em)) {
+                        kfree(failrec);
+                        return -EIO;
+                }
+                /* translate the file offset into an address in the extent */
+                logical = start - em->start;
+                logical = em->block_start + logical;
+                if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+                        /* compressed extents are re-read from their start */
+                        logical = em->block_start;
+                        failrec->bio_flags = EXTENT_BIO_COMPRESSED;
+                }
+                failrec->logical = logical;
+                free_extent_map(em);
+                set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
+                                EXTENT_DIRTY, GFP_NOFS);
+                set_state_private(failure_tree, start,
+                                 (u64)(unsigned long)failrec);
+        } else {
+                /* a record from an earlier failure of this range exists */
+                failrec = (struct io_failure_record *)(unsigned long)private;
+        }
+        num_copies = btrfs_num_copies(
+                              &BTRFS_I(inode)->root->fs_info->mapping_tree,
+                              failrec->logical, failrec->len);
+        failrec->last_mirror++;
+        if (!state) {
+                spin_lock(&BTRFS_I(inode)->io_tree.lock);
+                state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
+                                                    failrec->start,
+                                                    EXTENT_LOCKED);
+                if (state && state->start != failrec->start)
+                        state = NULL;
+                spin_unlock(&BTRFS_I(inode)->io_tree.lock);
+        }
+        if (!state || failrec->last_mirror > num_copies) {
+                /* give up: drop the record and report the failure */
+                set_state_private(failure_tree, failrec->start, 0);
+                clear_extent_bits(failure_tree, failrec->start,
+                                  failrec->start + failrec->len - 1,
+                                  EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
+                kfree(failrec);
+                return -EIO;
+        }
+        bio = bio_alloc(GFP_NOFS, 1);
+        bio->bi_private = state;
+        bio->bi_end_io = failed_bio->bi_end_io;
+        bio->bi_sector = failrec->logical >> 9;
+        bio->bi_bdev = failed_bio->bi_bdev;
+        bio->bi_size = 0;
+        bio_add_page(bio, page, failrec->len, start - page_offset(page));
+        /* retry in the same direction as the bio that failed */
+        if (failed_bio->bi_rw & (1 << BIO_RW))
+                rw = WRITE;
+        else
+                rw = READ;
+        BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
+                                                      failrec->last_mirror,
+                                                      failrec->bio_flags);
+        return 0;
+}
+/*
+ * Run after an IO completes.  If the inode's io failure tree holds any
+ * EXTENT_DIRTY range and a failure record is attached at 'start', the
+ * record is detached, its range bits are cleared and it is freed.
+ * Always returns 0.
+ */
+static int btrfs_clean_io_failures(struct inode *inode, u64 start)
+{
+        struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
+        struct io_failure_record *failrec;
+        u64 search_start = 0;
+        u64 private_failure;
+        /* fast check: nothing in the failure tree is marked dirty */
+        if (!count_range_bits(failure_tree, &search_start, (u64)-1, 1,
+                              EXTENT_DIRTY))
+                return 0;
+        /* no record stored for this offset */
+        if (get_state_private(failure_tree, start, &private_failure) != 0)
+                return 0;
+        failrec = (struct io_failure_record *)(unsigned long)private_failure;
+        set_state_private(failure_tree, failrec->start, 0);
+        clear_extent_bits(failure_tree, failrec->start,
+                          failrec->start + failrec->len - 1,
+                          EXTENT_DIRTY | EXTENT_LOCKED, GFP_NOFS);
+        kfree(failrec);
+        return 0;
+}
+/*
+ * when reads are done, we need to check csums to verify the data is correct
+ * if there's a match, we allow the bio to finish.  If not, we go through
+ * the io_failure_record routines to find good copies
+ *
+ * start/end are the inclusive byte range that was read into 'page'.
+ * Verification is skipped in three cases: the page has PageChecked set
+ * (the flag is cleared and the range treated as good), the inode has the
+ * NODATASUM flag, or the inode belongs to the data reloc tree and the
+ * range carries EXTENT_NODATASUM (which is cleared).
+ *
+ * The expected csum comes from state->private when 'state' starts at
+ * 'start', otherwise from get_state_private().  When it cannot be found
+ * or does not match the computed csum, the failure is logged and the
+ * range in the page is filled with bytes of value 1.
+ *
+ * Returns 0 when the data is accepted (including a failed check whose
+ * expected csum is 0), -EIO otherwise.
+ */
+static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
+                               struct extent_state *state)
+{
+        size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT);
+        struct inode *inode = page->mapping->host;
+        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+        char *kaddr;
+        u64 private = ~(u32)0;
+        int ret;
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        u32 csum = ~(u32)0;
+        if (PageChecked(page)) {
+                ClearPageChecked(page);
+                goto good;
+        }
+        if (btrfs_test_flag(inode, NODATASUM))
+                return 0;
+        if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
+            test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1)) {
+                clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM,
+                                  GFP_NOFS);
+                return 0;
+        }
+        if (state && state->start == start) {
+                private = state->private;
+                ret = 0;
+        } else {
+                ret = get_state_private(io_tree, start, &private);
+        }
+        kaddr = kmap_atomic(page, KM_USER0); /* unmapped on both exits below */
+        if (ret)
+                goto zeroit;
+        csum = btrfs_csum_data(root, kaddr + offset, csum,  end - start + 1);
+        btrfs_csum_final(csum, (char *)&csum);
+        if (csum != private)
+                goto zeroit;
+        kunmap_atomic(kaddr, KM_USER0);
+good:
+        /* if the io failure tree for this inode is non-empty,
+         * check to see if we've recovered from a failed IO
+         */
+        btrfs_clean_io_failures(inode, start);
+        return 0;
+zeroit:
+        printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u "
+               "private %llu\n", page->mapping->host->i_ino,
+               (unsigned long long)start, csum,
+               (unsigned long long)private);
+        memset(kaddr + offset, 1, end - start + 1); /* fill byte is 1, not 0 */
+        flush_dcache_page(page);
+        kunmap_atomic(kaddr, KM_USER0);
+        if (private == 0)
+                return 0;
+        return -EIO;
+}
+/*
+ * Record 'inode' as an orphan so that an unlink/truncate interrupted part
+ * way through can be found again: the inode is linked onto the root's
+ * in-memory orphan list and an orphan item is inserted for its number.
+ *
+ * Returns 0 if the inode was already on the list, otherwise the result of
+ * btrfs_insert_orphan_item().
+ */
+int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
+{
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        int already_listed;
+        spin_lock(&root->list_lock);
+        already_listed = !list_empty(&BTRFS_I(inode)->i_orphan);
+        if (!already_listed)
+                list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
+        spin_unlock(&root->list_lock);
+        /* already on the orphan list, we're good */
+        if (already_listed)
+                return 0;
+        /* insert an orphan item to track this unlinked/truncated file */
+        return btrfs_insert_orphan_item(trans, root, inode->i_ino);
+}
+/*
+ * The truncate/delete is done, so take 'inode' off the root's orphan list
+ * and, when a transaction handle is supplied, delete its orphan item.
+ *
+ * Returns 0 if the inode was not on the list or 'trans' is NULL, otherwise
+ * the result of btrfs_del_orphan_item().
+ */
+int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
+{
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        int was_listed;
+        spin_lock(&root->list_lock);
+        was_listed = !list_empty(&BTRFS_I(inode)->i_orphan);
+        if (was_listed)
+                list_del_init(&BTRFS_I(inode)->i_orphan);
+        spin_unlock(&root->list_lock);
+        /* without a transaction only the in-memory list is touched */
+        if (!was_listed || !trans)
+                return 0;
+        return btrfs_del_orphan_item(trans, root, inode->i_ino);
+}
+/*
+ * this cleans up any orphans that may be left on the list from the last use
+ * of this root.
+ *
+ * The loop repeatedly searches the root for the highest key of the form
+ * (BTRFS_ORPHAN_OBJECTID, BTRFS_ORPHAN_ITEM_KEY, *); the key offset holds
+ * the inode number of the orphan.  Each orphan inode is loaded, put on the
+ * in-memory orphan list and then:
+ *  - a bad inode only has its orphan item deleted;
+ *  - an inode that still has links gets btrfs_truncate();
+ *  - an inode without links is left to the final iput().
+ * The loop ends on a search error, when no further orphan item is found,
+ * or when the inode cannot be obtained.  Counts of unlinked and truncated
+ * orphans are logged at the end.  Nothing is returned; a failed path
+ * allocation makes the function return without doing anything.
+ */
+void btrfs_orphan_cleanup(struct btrfs_root *root)
+{
+        struct btrfs_path *path;
+        struct extent_buffer *leaf;
+        struct btrfs_item *item;
+        struct btrfs_key key, found_key;
+        struct btrfs_trans_handle *trans;
+        struct inode *inode;
+        int ret = 0, nr_unlink = 0, nr_truncate = 0;
+        path = btrfs_alloc_path();
+        if (!path)
+                return;
+        path->reada = -1;
+        key.objectid = BTRFS_ORPHAN_OBJECTID;
+        btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
+        key.offset = (u64)-1;
+        while (1) {
+                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+                if (ret < 0) {
+                        printk(KERN_ERR "Error searching slot for orphan: %d"
+                               "\n", ret);
+                        break;
+                }
+                /*
+                 * if ret == 0 means we found what we were searching for, which
+                 * is weird, but possible, so only screw with path if we didn't
+                 * find the key and see if we have stuff that matches
+                 */
+                if (ret > 0) {
+                        if (path->slots[0] == 0)
+                                break;
+                        path->slots[0]--;
+                }
+                /* pull out the item */
+                leaf = path->nodes[0];
+                item = btrfs_item_nr(leaf, path->slots[0]); /* NOTE(review): never read */
+                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+                /* make sure the item matches what we want */
+                if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
+                        break;
+                if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY)
+                        break;
+                /* release the path since we're done with it */
+                btrfs_release_path(root, path);
+                /*
+                 * this is where we are basically btrfs_lookup, without the
+                 * crossing root thing.  we store the inode number in the
+                 * offset of the orphan item.
+                 */
+                inode = btrfs_iget_locked(root->fs_info->sb,
+                                          found_key.offset, root);
+                if (!inode)
+                        break;
+                if (inode->i_state & I_NEW) {
+                        BTRFS_I(inode)->root = root;
+                        /* have to set the location manually */
+                        BTRFS_I(inode)->location.objectid = inode->i_ino;
+                        BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
+                        BTRFS_I(inode)->location.offset = 0;
+                        btrfs_read_locked_inode(inode);
+                        unlock_new_inode(inode);
+                }
+                /*
+                 * add this inode to the orphan list so btrfs_orphan_del does
+                 * the proper thing when we hit it
+                 */
+                spin_lock(&root->list_lock);
+                list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
+                spin_unlock(&root->list_lock);
+                /*
+                 * if this is a bad inode, means we actually succeeded in
+                 * removing the inode, but not the orphan record, which means
+                 * we need to manually delete the orphan since iput will just
+                 * do a destroy_inode
+                 */
+                if (is_bad_inode(inode)) {
+                        trans = btrfs_start_transaction(root, 1);
+                        btrfs_orphan_del(trans, inode);
+                        btrfs_end_transaction(trans, root);
+                        iput(inode);
+                        continue;
+                }
+                /* if we have links, this was a truncate, lets do that */
+                if (inode->i_nlink) {
+                        nr_truncate++;
+                        btrfs_truncate(inode);
+                } else {
+                        nr_unlink++;
+                }
+                /* this will do delete_inode and everything for us */
+                iput(inode);
+        }
+        if (nr_unlink)
+                printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
+        if (nr_truncate)
+                printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
+        btrfs_free_path(path);
+}
+/*
+ * read an inode from the btree into the in-memory inode
+ *
+ * The inode item is looked up at BTRFS_I(inode)->location in the inode's
+ * root.  Mode, link count, uid/gid, size, the three timestamps, byte
+ * count, generation, sequence, flags and block group are copied from the
+ * item, and index_cnt is reset to (u64)-1.  The file type then selects
+ * which operation tables are installed: regular files, directories
+ * (read-only inode operations when the root is the tree root) and
+ * symlinks each get their own; every other type goes through
+ * init_special_inode().
+ *
+ * If the lookup fails the inode is turned into a bad inode with
+ * make_bad_inode().  A failed path allocation is a BUG_ON.
+ */
+void btrfs_read_locked_inode(struct inode *inode)
+{
+        struct btrfs_path *path;
+        struct extent_buffer *leaf;
+        struct btrfs_inode_item *inode_item;
+        struct btrfs_timespec *tspec;
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        struct btrfs_key location;
+        u64 alloc_group_block;
+        u32 rdev;
+        int ret;
+        path = btrfs_alloc_path();
+        BUG_ON(!path);
+        memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
+        ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
+        if (ret)
+                goto make_bad;
+        leaf = path->nodes[0];
+        inode_item = btrfs_item_ptr(leaf, path->slots[0],
+                                    struct btrfs_inode_item);
+        inode->i_mode = btrfs_inode_mode(leaf, inode_item);
+        inode->i_nlink = btrfs_inode_nlink(leaf, inode_item);
+        inode->i_uid = btrfs_inode_uid(leaf, inode_item);
+        inode->i_gid = btrfs_inode_gid(leaf, inode_item);
+        btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
+        tspec = btrfs_inode_atime(inode_item);
+        inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec);
+        inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
+        tspec = btrfs_inode_mtime(inode_item);
+        inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec);
+        inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
+        tspec = btrfs_inode_ctime(inode_item);
+        inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec);
+        inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
+        inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
+        BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
+        BTRFS_I(inode)->sequence = btrfs_inode_sequence(leaf, inode_item);
+        inode->i_generation = BTRFS_I(inode)->generation;
+        inode->i_rdev = 0; /* on-disk rdev only reaches init_special_inode() */
+        rdev = btrfs_inode_rdev(leaf, inode_item);
+        BTRFS_I(inode)->index_cnt = (u64)-1;
+        BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
+        alloc_group_block = btrfs_inode_block_group(leaf, inode_item);
+        BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0,
+                                                alloc_group_block, 0);
+        btrfs_free_path(path);
+        inode_item = NULL; /* pointed into the path that was just freed */
+        switch (inode->i_mode & S_IFMT) {
+        case S_IFREG:
+                inode->i_mapping->a_ops = &btrfs_aops;
+                inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
+                BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+                inode->i_fop = &btrfs_file_operations;
+                inode->i_op = &btrfs_file_inode_operations;
+                break;
+        case S_IFDIR:
+                inode->i_fop = &btrfs_dir_file_operations;
+                if (root == root->fs_info->tree_root)
+                        inode->i_op = &btrfs_dir_ro_inode_operations;
+                else
+                        inode->i_op = &btrfs_dir_inode_operations;
+                break;
+        case S_IFLNK:
+                inode->i_op = &btrfs_symlink_inode_operations;
+                inode->i_mapping->a_ops = &btrfs_symlink_aops;
+                inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
+                break;
+        default:
+                init_special_inode(inode, inode->i_mode, rdev);
+                break;
+        }
+        return;
+make_bad:
+        btrfs_free_path(path);
+        make_bad_inode(inode);
+}
+/*
+ * Copy the fields of the in-memory 'inode' into the inode item 'item'
+ * that lives in 'leaf'.  The size written is disk_i_size and the item's
+ * transid is taken from 'trans'.
+ */
+static void fill_inode_item(struct btrfs_trans_handle *trans,
+                            struct extent_buffer *leaf,
+                            struct btrfs_inode_item *item,
+                            struct inode *inode)
+{
+        struct btrfs_timespec *tspec;
+        /* ownership, size, mode and link count */
+        btrfs_set_inode_uid(leaf, item, inode->i_uid);
+        btrfs_set_inode_gid(leaf, item, inode->i_gid);
+        btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
+        btrfs_set_inode_mode(leaf, item, inode->i_mode);
+        btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
+        /* access, modification and change times */
+        tspec = btrfs_inode_atime(item);
+        btrfs_set_timespec_sec(leaf, tspec, inode->i_atime.tv_sec);
+        btrfs_set_timespec_nsec(leaf, tspec, inode->i_atime.tv_nsec);
+        tspec = btrfs_inode_mtime(item);
+        btrfs_set_timespec_sec(leaf, tspec, inode->i_mtime.tv_sec);
+        btrfs_set_timespec_nsec(leaf, tspec, inode->i_mtime.tv_nsec);
+        tspec = btrfs_inode_ctime(item);
+        btrfs_set_timespec_sec(leaf, tspec, inode->i_ctime.tv_sec);
+        btrfs_set_timespec_nsec(leaf, tspec, inode->i_ctime.tv_nsec);
+        /* accounting and bookkeeping fields */
+        btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
+        btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
+        btrfs_set_inode_sequence(leaf, item, BTRFS_I(inode)->sequence);
+        btrfs_set_inode_transid(leaf, item, trans->transid);
+        btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
+        btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
+        btrfs_set_inode_block_group(leaf, item, BTRFS_I(inode)->block_group);
+}
+/*
+ * Write the in-memory inode back into its inode item in the btree.
+ *
+ * Returns 0 on success, -ENOENT when the inode item is not found, or the
+ * negative error from btrfs_lookup_inode().  A failed path allocation is
+ * a BUG_ON.
+ */
+noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
+                                struct btrfs_root *root, struct inode *inode)
+{
+        struct btrfs_inode_item *item;
+        struct extent_buffer *leaf;
+        struct btrfs_path *path;
+        int ret;
+        path = btrfs_alloc_path();
+        BUG_ON(!path);
+        ret = btrfs_lookup_inode(trans, root, path,
+                                 &BTRFS_I(inode)->location, 1);
+        if (ret > 0) {
+                /* a positive result means the item is missing */
+                ret = -ENOENT;
+        } else if (ret == 0) {
+                leaf = path->nodes[0];
+                item = btrfs_item_ptr(leaf, path->slots[0],
+                                      struct btrfs_inode_item);
+                fill_inode_item(trans, leaf, item, inode);
+                btrfs_mark_buffer_dirty(leaf);
+                btrfs_set_inode_last_trans(trans, inode);
+        }
+        btrfs_free_path(path);
+        return ret;
+}
+/*
+ * unlink helper that gets used here in inode.c and in the tree logging
+ * recovery code.  It removes a link in a directory with a given name, and
+ * also drops the back refs in the inode to the directory
+ *
+ * In order: the dir item for 'name' is deleted, the inode ref from 'inode'
+ * to 'dir' is deleted (which yields the dir index), the dir index item is
+ * deleted, and the matching entries are removed from the tree log.  On
+ * success the directory size shrinks by name_len * 2, the timestamps are
+ * refreshed, the link count of 'inode' is dropped and both inodes are
+ * written back.
+ *
+ * Returns 0 on success, -ENOMEM if no path could be allocated, -ENOENT if
+ * one of the directory items is missing, or the error of the step that
+ * failed.  Unexpected failures of the tree log steps are BUG_ON()s.
+ * NOTE(review): the result of btrfs_update_inode() on 'dir' is ignored.
+ */
+int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
+                       struct btrfs_root *root,
+                       struct inode *dir, struct inode *inode,
+                       const char *name, int name_len)
+{
+        struct btrfs_path *path;
+        int ret = 0;
+        struct extent_buffer *leaf;
+        struct btrfs_dir_item *di;
+        struct btrfs_key key;
+        u64 index;
+        path = btrfs_alloc_path();
+        if (!path) {
+                /* there is no path to free, so do not go through err: */
+                ret = -ENOMEM;
+                goto out;
+        }
+        di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
+                                    name, name_len, -1);
+        if (IS_ERR(di)) {
+                ret = PTR_ERR(di);
+                goto err;
+        }
+        if (!di) {
+                ret = -ENOENT;
+                goto err;
+        }
+        leaf = path->nodes[0];
+        btrfs_dir_item_key_to_cpu(leaf, di, &key);
+        ret = btrfs_delete_one_dir_name(trans, root, path, di);
+        if (ret)
+                goto err;
+        btrfs_release_path(root, path);
+        ret = btrfs_del_inode_ref(trans, root, name, name_len,
+                                  inode->i_ino,
+                                  dir->i_ino, &index);
+        if (ret) {
+                printk(KERN_INFO "btrfs failed to delete reference to %.*s, "
+                       "inode %lu parent %lu\n", name_len, name,
+                       inode->i_ino, dir->i_ino);
+                goto err;
+        }
+        di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
+                                         index, name, name_len, -1);
+        if (IS_ERR(di)) {
+                ret = PTR_ERR(di);
+                goto err;
+        }
+        if (!di) {
+                ret = -ENOENT;
+                goto err;
+        }
+        ret = btrfs_delete_one_dir_name(trans, root, path, di);
+        /* handle a failure here the same way as for the dir item above */
+        if (ret)
+                goto err;
+        btrfs_release_path(root, path);
+        ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
+                                         inode, dir->i_ino);
+        BUG_ON(ret != 0 && ret != -ENOENT);
+        if (ret != -ENOENT)
+                BTRFS_I(dir)->log_dirty_trans = trans->transid;
+        ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
+                                           dir, index);
+        BUG_ON(ret);
+err:
+        btrfs_free_path(path);
+        if (ret)
+                goto out;
+        btrfs_i_size_write(dir, dir->i_size - name_len * 2);
+        inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+        btrfs_update_inode(trans, root, dir);
+        btrfs_drop_nlink(inode);
+        ret = btrfs_update_inode(trans, root, inode);
+        dir->i_sb->s_dirt = 1;
+out:
+        return ret;
+}
+/*
+ * unlink entry point: removes the name described by 'dentry' from 'dir'
+ * inside a new transaction.  When the inode's link count is zero
+ * afterwards, it is also recorded as an orphan.
+ *
+ * Returns the error from btrfs_check_free_space(), otherwise the first
+ * error reported by btrfs_unlink_inode() or btrfs_orphan_add(), or 0.
+ * NOTE(review): the result of btrfs_start_transaction() is not checked.
+ */
+static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+        struct btrfs_root *root;
+        struct btrfs_trans_handle *trans;
+        struct inode *inode = dentry->d_inode;
+        int ret;
+        int orphan_ret;
+        unsigned long nr = 0;
+        root = BTRFS_I(dir)->root;
+        ret = btrfs_check_free_space(root, 1, 1);
+        if (ret)
+                goto fail;
+        trans = btrfs_start_transaction(root, 1);
+        btrfs_set_trans_block_group(trans, dir);
+        ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
+                                 dentry->d_name.name, dentry->d_name.len);
+        if (inode->i_nlink == 0) {
+                /*
+                 * still record the orphan, but do not let its result hide
+                 * an error already reported by btrfs_unlink_inode()
+                 */
+                orphan_ret = btrfs_orphan_add(trans, inode);
+                if (!ret)
+                        ret = orphan_ret;
+        }
+        nr = trans->blocks_used;
+        btrfs_end_transaction_throttle(trans, root);
+fail:
+        btrfs_btree_balance_dirty(root, nr);
+        return ret;
+}
+/*
+ * rmdir entry point.  The directory is first recorded as an orphan, then
+ * unlinked from 'dir'; on success its size is set to 0.
+ *
+ * Returns -ENOTEMPTY for a directory larger than BTRFS_EMPTY_DIR_SIZE or
+ * for the root of a subvolume/snapshot.  Otherwise the first error among
+ * the orphan/unlink steps wins, then the error from the free space check
+ * or from ending the transaction; 0 when everything worked.
+ */
+static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+        struct btrfs_root *root = BTRFS_I(dir)->root;
+        struct inode *inode = dentry->d_inode;
+        struct btrfs_trans_handle *trans;
+        unsigned long nr = 0;
+        int err = 0;
+        int ret;
+        /*
+         * the FIRST_FREE_OBJECTID check makes sure we don't try to rmdir
+         * the root of a subvolume or snapshot
+         */
+        if (inode->i_ino == BTRFS_FIRST_FREE_OBJECTID ||
+            inode->i_size > BTRFS_EMPTY_DIR_SIZE)
+                return -ENOTEMPTY;
+        ret = btrfs_check_free_space(root, 1, 1);
+        if (!ret) {
+                trans = btrfs_start_transaction(root, 1);
+                btrfs_set_trans_block_group(trans, dir);
+                err = btrfs_orphan_add(trans, inode);
+                if (!err) {
+                        /* now the directory is empty */
+                        err = btrfs_unlink_inode(trans, root, dir, inode,
+                                                 dentry->d_name.name,
+                                                 dentry->d_name.len);
+                        if (!err)
+                                btrfs_i_size_write(inode, 0);
+                }
+                nr = trans->blocks_used;
+                ret = btrfs_end_transaction_throttle(trans, root);
+        }
+        btrfs_btree_balance_dirty(root, nr);
+        /* errors from the orphan/unlink steps take precedence */
+        if (ret && !err)
+                err = ret;
+        return err;
+}
+#if 0
+/*
+ * when truncating bytes in a file, it is possible to avoid reading
+ * the leaves that contain only checksum items.  This can be the
+ * majority of the IO required to delete a large file, but it must
+ * be done carefully.
+ *
+ * The keys in the level just above the leaves are checked to make sure
+ * the lowest key in a given leaf is a csum key, and starts at an offset
+ * after the new size.
+ *
+ * Then the key for the next leaf is checked to make sure it also has
+ * a checksum item for the same file.  If it does, we know our target leaf
+ * contains only checksum items, and it can be safely freed without reading
+ * it.
+ *
+ * This is just an optimization targeted at large files.  It may do
+ * nothing.  It will return 0 unless things went badly.
+ *
+ * On every exit path path->lowest_level and path->keep_locks are reset
+ * to 0 and the path is released.
+ *
+ * NOTE: this function is currently compiled out by the surrounding
+ * #if 0 / #endif pair.
+ */
+static noinline int drop_csum_leaves(struct btrfs_trans_handle *trans,
+                                     struct btrfs_root *root,
+                                     struct btrfs_path *path,
+                                     struct inode *inode, u64 new_size)
+{
+        struct btrfs_key key;
+        int ret;
+        int nritems;
+        struct btrfs_key found_key;
+        struct btrfs_key other_key;
+        struct btrfs_leaf_ref *ref;
+        u64 leaf_gen;
+        u64 leaf_start;
+        /* work on path->nodes[1], the level just above the leaves */
+        path->lowest_level = 1;
+        key.objectid = inode->i_ino;
+        key.type = BTRFS_CSUM_ITEM_KEY;
+        key.offset = new_size;
+again:
+        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+        if (ret < 0)
+                goto out;
+        /* no level 1 node in the path, nothing for us to do */
+        if (path->nodes[1] == NULL) {
+                ret = 0;
+                goto out;
+        }
+        ret = 0;
+        btrfs_node_key_to_cpu(path->nodes[1], &found_key, path->slots[1]);
+        nritems = btrfs_header_nritems(path->nodes[1]);
+        if (!nritems)
+                goto out;
+        if (path->slots[1] >= nritems)
+                goto next_node;
+        /* did we find a key greater than anything we want to delete? */
+        if (found_key.objectid > inode->i_ino ||
+           (found_key.objectid == inode->i_ino && found_key.type > key.type))
+                goto out;
+        /* we check the next key in the node to make sure the leaf contains
+         * only checksum items.  This comparison doesn't work if our
+         * leaf is the last one in the node
+         */
+        if (path->slots[1] + 1 >= nritems) {
+next_node:
+                /* search forward from the last key in the node, this
+                 * will bring us into the next node in the tree
+                 */
+                btrfs_node_key_to_cpu(path->nodes[1], &found_key, nritems - 1);
+                /* unlikely, but we inc below, so check to be safe */
+                if (found_key.offset == (u64)-1)
+                        goto out;
+                /* search_forward needs a path with locks held, do the
+                 * search again for the original key.  It is possible
+                 * this will race with a balance and return a path that
+                 * we could modify, but this drop is just an optimization
+                 * and is allowed to miss some leaves.
+                 */
+                btrfs_release_path(root, path);
+                found_key.offset++;
+                /* setup a max key for search_forward */
+                other_key.offset = (u64)-1;
+                other_key.type = key.type;
+                other_key.objectid = key.objectid;
+                path->keep_locks = 1;
+                ret = btrfs_search_forward(root, &found_key, &other_key,
+                                           path, 0, 0);
+                path->keep_locks = 0;
+                /* stop once the forward search leaves this inode's csums */
+                if (ret || found_key.objectid != key.objectid ||
+                    found_key.type != key.type) {
+                        ret = 0;
+                        goto out;
+                }
+                /* restart the main search from the key we just found */
+                key.offset = found_key.offset;
+                btrfs_release_path(root, path);
+                cond_resched();
+                goto again;
+        }
+        /* we know there's one more slot after us in the tree,
+         * read that key so we can verify it is also a checksum item
+         */
+        btrfs_node_key_to_cpu(path->nodes[1], &other_key, path->slots[1] + 1);
+        /* this leaf starts before our csum range, keep it and move on */
+        if (found_key.objectid < inode->i_ino)
+                goto next_key;
+        if (found_key.type != key.type || found_key.offset < new_size)
+                goto next_key;
+        /*
+         * if the key for the next leaf isn't a csum key from this objectid,
+         * we can't be sure there aren't good items inside this leaf.
+         * Bail out
+         */
+        if (other_key.objectid != inode->i_ino || other_key.type != key.type)
+                goto out;
+        /* remember the leaf's location and generation before deleting it */
+        leaf_start = btrfs_node_blockptr(path->nodes[1], path->slots[1]);
+        leaf_gen = btrfs_node_ptr_generation(path->nodes[1], path->slots[1]);
+        /*
+         * it is safe to delete this leaf, it contains only
+         * csum items from this inode at an offset >= new_size
+         */
+        ret = btrfs_del_leaf(trans, root, path, leaf_start);
+        BUG_ON(ret);
+        /*
+         * for ref_cows roots, record an empty leaf ref (nritems == 0) when
+         * the leaf is older than the running transaction
+         */
+        if (root->ref_cows && leaf_gen < trans->transid) {
+                ref = btrfs_alloc_leaf_ref(root, 0);
+                if (ref) {
+                        ref->root_gen = root->root_key.offset;
+                        ref->bytenr = leaf_start;
+                        ref->owner = 0;
+                        ref->generation = leaf_gen;
+                        ref->nritems = 0;
+                        ret = btrfs_add_leaf_ref(root, ref, 0);
+                        WARN_ON(ret);
+                        btrfs_free_leaf_ref(root, ref);
+                } else {
+                        WARN_ON(1);
+                }
+        }
+next_key:
+        btrfs_release_path(root, path);
+        /* continue with the next leaf if it still holds our csum items */
+        if (other_key.objectid == inode->i_ino &&
+            other_key.type == key.type && other_key.offset > key.offset) {
+                key.offset = other_key.offset;
+                cond_resched();
+                goto again;
+        }
+        ret = 0;
+out:
+        /* fixup any changes we've made to the path */
+        path->lowest_level = 0;
+        path->keep_locks = 0;
+        btrfs_release_path(root, path);
+        return ret;
+}
+#endif
+/*
+ * this can truncate away extent items, csum items and directory items.
+ * It starts at a high offset and removes keys until it can't find
+ * any higher than new_size
+ *
+ * csum items that cross the new i_size are truncated to the new size
+ * as well.
+ *
+ * min_type is the minimum key type to truncate down to.  If set to 0, this
+ * will kill all the items on this inode, including the INODE_ITEM_KEY.
+ *
+ * Items are walked backwards from the highest possible key for the inode.
+ * Items that must go are batched (pending_del_slot / pending_del_nr) as
+ * long as they sit in adjacent slots of the same leaf, and each batch is
+ * removed with a single btrfs_del_items() call.
+ *
+ * Returns 0 on success, a negative value from btrfs_search_slot(), or the
+ * result of the final btrfs_del_items() call.  The superblock is marked
+ * dirty (s_dirt) on every exit path.
+ */
+noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
+                                        struct btrfs_root *root,
+                                        struct inode *inode,
+                                        u64 new_size, u32 min_type)
+{
+        int ret;
+        struct btrfs_path *path;
+        struct btrfs_key key;
+        struct btrfs_key found_key;
+        u32 found_type;
+        struct extent_buffer *leaf;
+        struct btrfs_file_extent_item *fi;
+        u64 extent_start = 0;
+        u64 extent_num_bytes = 0;
+        u64 item_end = 0;
+        u64 root_gen = 0;
+        u64 root_owner = 0;
+        int found_extent;
+        int del_item;
+        int pending_del_nr = 0;
+        int pending_del_slot = 0;
+        int extent_type = -1;
+        int encoding;
+        u64 mask = root->sectorsize - 1;
+        if (root->ref_cows)
+                btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
+        path = btrfs_alloc_path();
+        /* check the allocation before the first dereference of path */
+        BUG_ON(!path);
+        path->reada = -1;
+        /* FIXME, add redo link to tree so we don't leak on crash */
+        key.objectid = inode->i_ino;
+        key.offset = (u64)-1;
+        key.type = (u8)-1;
+        /*
+         * NOTE(review): btrfs_init_path() runs after path->reada was set
+         * above; confirm it does not reset that field.
+         */
+        btrfs_init_path(path);
+search_again:
+        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+        if (ret < 0)
+                goto error;
+        if (ret > 0) {
+                /* there are no items in the tree for us to truncate, we're
+                 * done
+                 */
+                if (path->slots[0] == 0) {
+                        ret = 0;
+                        goto error;
+                }
+                /* no exact match, step back to the previous item */
+                path->slots[0]--;
+        }
+        while (1) {
+                fi = NULL;
+                leaf = path->nodes[0];
+                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+                found_type = btrfs_key_type(&found_key);
+                encoding = 0;
+                if (found_key.objectid != inode->i_ino)
+                        break;
+                if (found_type < min_type)
+                        break;
+                /* item_end is the last byte covered by this item */
+                item_end = found_key.offset;
+                if (found_type == BTRFS_EXTENT_DATA_KEY) {
+                        fi = btrfs_item_ptr(leaf, path->slots[0],
+                                            struct btrfs_file_extent_item);
+                        extent_type = btrfs_file_extent_type(leaf, fi);
+                        encoding = btrfs_file_extent_compression(leaf, fi);
+                        encoding |= btrfs_file_extent_encryption(leaf, fi);
+                        encoding |= btrfs_file_extent_other_encoding(leaf, fi);
+                        if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
+                                item_end +=
+                                    btrfs_file_extent_num_bytes(leaf, fi);
+                        } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+                                item_end += btrfs_file_extent_inline_len(leaf,
+                                                                         fi);
+                        }
+                        item_end--;
+                }
+                if (item_end < new_size) {
+                        /*
+                         * this item lies entirely below new_size, so pick
+                         * the next lower key type to search for
+                         */
+                        if (found_type == BTRFS_DIR_ITEM_KEY)
+                                found_type = BTRFS_INODE_ITEM_KEY;
+                        else if (found_type == BTRFS_EXTENT_ITEM_KEY)
+                                found_type = BTRFS_EXTENT_DATA_KEY;
+                        else if (found_type == BTRFS_EXTENT_DATA_KEY)
+                                found_type = BTRFS_XATTR_ITEM_KEY;
+                        else if (found_type == BTRFS_XATTR_ITEM_KEY)
+                                found_type = BTRFS_INODE_REF_KEY;
+                        else if (found_type)
+                                found_type--;
+                        else
+                                break;
+                        btrfs_set_key_type(&key, found_type);
+                        goto next;
+                }
+                /* items starting at or past new_size are removed entirely */
+                if (found_key.offset >= new_size)
+                        del_item = 1;
+                else
+                        del_item = 0;
+                found_extent = 0;
+                /* FIXME, shrink the extent if the ref count is only 1 */
+                if (found_type != BTRFS_EXTENT_DATA_KEY)
+                        goto delete;
+                if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
+                        u64 num_dec;
+                        extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
+                        if (!del_item && !encoding) {
+                                /*
+                                 * the extent crosses new_size: shrink its
+                                 * num_bytes to new_size rounded up to a
+                                 * sector boundary
+                                 */
+                                u64 orig_num_bytes =
+                                        btrfs_file_extent_num_bytes(leaf, fi);
+                                extent_num_bytes = new_size -
+                                        found_key.offset + root->sectorsize - 1;
+                                extent_num_bytes = extent_num_bytes &
+                                        ~((u64)root->sectorsize - 1);
+                                btrfs_set_file_extent_num_bytes(leaf, fi,
+                                                         extent_num_bytes);
+                                num_dec = (orig_num_bytes -
+                                           extent_num_bytes);
+                                if (root->ref_cows && extent_start != 0)
+                                        inode_sub_bytes(inode, num_dec);
+                                btrfs_mark_buffer_dirty(leaf);
+                        } else {
+                                extent_num_bytes =
+                                        btrfs_file_extent_disk_num_bytes(leaf,
+                                                                         fi);
+                                /* FIXME blocksize != 4096 */
+                                num_dec = btrfs_file_extent_num_bytes(leaf, fi);
+                                /* a disk bytenr of 0 has nothing to free */
+                                if (extent_start != 0) {
+                                        found_extent = 1;
+                                        if (root->ref_cows)
+                                                inode_sub_bytes(inode, num_dec);
+                                }
+                                root_gen = btrfs_header_generation(leaf);
+                                root_owner = btrfs_header_owner(leaf);
+                        }
+                } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+                        /*
+                         * we can't truncate inline items that have had
+                         * special encodings
+                         */
+                        if (!del_item &&
+                            btrfs_file_extent_compression(leaf, fi) == 0 &&
+                            btrfs_file_extent_encryption(leaf, fi) == 0 &&
+                            btrfs_file_extent_other_encoding(leaf, fi) == 0) {
+                                u32 size = new_size - found_key.offset;
+                                if (root->ref_cows) {
+                                        inode_sub_bytes(inode, item_end + 1 -
+                                                        new_size);
+                                }
+                                size =
+                                    btrfs_file_extent_calc_inline_size(size);
+                                ret = btrfs_truncate_item(trans, root, path,
+                                                          size, 1);
+                                BUG_ON(ret);
+                        } else if (root->ref_cows) {
+                                inode_sub_bytes(inode, item_end + 1 -
+                                                found_key.offset);
+                        }
+                }
+delete:
+                if (del_item) {
+                        if (!pending_del_nr) {
+                                /* no pending yet, add ourselves */
+                                pending_del_slot = path->slots[0];
+                                pending_del_nr = 1;
+                        } else if (pending_del_nr &&
+                                   path->slots[0] + 1 == pending_del_slot) {
+                                /* hop on the pending chunk */
+                                pending_del_nr++;
+                                pending_del_slot = path->slots[0];
+                        } else {
+                                BUG();
+                        }
+                } else {
+                        break;
+                }
+                if (found_extent) {
+                        ret = btrfs_free_extent(trans, root, extent_start,
+                                                extent_num_bytes,
+                                                leaf->start, root_owner,
+                                                root_gen, inode->i_ino, 0);
+                        BUG_ON(ret);
+                }
+next:
+                if (path->slots[0] == 0) {
+                        /* start of the leaf: flush the batch or re-search */
+                        if (pending_del_nr)
+                                goto del_pending;
+                        btrfs_release_path(root, path);
+                        goto search_again;
+                }
+                path->slots[0]--;
+                if (pending_del_nr &&
+                    path->slots[0] + 1 != pending_del_slot) {
+                        /*
+                         * the current slot is no longer adjacent to the
+                         * pending batch, delete the batch and search again
+                         */
+del_pending:
+                        ret = btrfs_del_items(trans, root, path,
+                                              pending_del_slot,
+                                              pending_del_nr);
+                        BUG_ON(ret);
+                        pending_del_nr = 0;
+                        btrfs_release_path(root, path);
+                        goto search_again;
+                }
+        }
+        ret = 0;
+error:
+        /* flush whatever batch is still pending when the walk stopped */
+        if (pending_del_nr) {
+                ret = btrfs_del_items(trans, root, path, pending_del_slot,
+                                      pending_del_nr);
+        }
+        btrfs_free_path(path);
+        inode->i_sb->s_dirt = 1;
+        return ret;
+}
+/*
+ * taken from block_truncate_page, but does cow as it zeros out
+ * any bytes left in the last page in the file.
+ *
+ * 'from' is the byte offset where zeroing starts; everything from there to
+ * the end of the page that contains it is cleared and the page is marked
+ * delalloc and dirty.  Nothing is done when 'from' is sector aligned.
+ *
+ * Returns 0 on success, -ENOMEM if the page cannot be grabbed and -EIO if
+ * the page cannot be brought up to date.
+ */
+static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
+{
+        struct inode *inode = mapping->host;
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+        struct btrfs_ordered_extent *ordered;
+        char *kaddr;
+        u32 blocksize = root->sectorsize;
+        pgoff_t index = from >> PAGE_CACHE_SHIFT;
+        unsigned offset = from & (PAGE_CACHE_SIZE-1);
+        struct page *page;
+        int ret = 0;
+        u64 page_start;
+        u64 page_end;
+        if ((offset & (blocksize - 1)) == 0)
+                goto out;
+again:
+        page = grab_cache_page(mapping, index);
+        if (!page) {
+                /*
+                 * set the error here rather than once before the label:
+                 * on a retry ret may already hold the (possibly zero)
+                 * result of btrfs_readpage()
+                 */
+                ret = -ENOMEM;
+                goto out;
+        }
+        page_start = page_offset(page);
+        page_end = page_start + PAGE_CACHE_SIZE - 1;
+        if (!PageUptodate(page)) {
+                /* the result is judged via PageUptodate() below */
+                ret = btrfs_readpage(NULL, page);
+                lock_page(page);
+                /* the page left this mapping while we waited, start over */
+                if (page->mapping != mapping) {
+                        unlock_page(page);
+                        page_cache_release(page);
+                        goto again;
+                }
+                if (!PageUptodate(page)) {
+                        ret = -EIO;
+                        goto out_unlock;
+                }
+        }
+        wait_on_page_writeback(page);
+        lock_extent(io_tree, page_start, page_end, GFP_NOFS);
+        set_page_extent_mapped(page);
+        ordered = btrfs_lookup_ordered_extent(inode, page_start);
+        if (ordered) {
+                /* drop our locks, wait for the ordered extent, then retry */
+                unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+                unlock_page(page);
+                page_cache_release(page);
+                btrfs_start_ordered_extent(inode, ordered, 1);
+                btrfs_put_ordered_extent(ordered);
+                goto again;
+        }
+        btrfs_set_extent_delalloc(inode, page_start, page_end);
+        ret = 0;
+        if (offset != PAGE_CACHE_SIZE) {
+                /* zero from 'offset' to the end of the page */
+                kaddr = kmap(page);
+                memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
+                flush_dcache_page(page);
+                kunmap(page);
+        }
+        ClearPageChecked(page);
+        set_page_dirty(page);
+        unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+out_unlock:
+        unlock_page(page);
+        page_cache_release(page);
+out:
+        return ret;
+}
+/*
+ * Prepare the file for growing from its current i_size up to 'size'.
+ *
+ * The tail of the current last page is zeroed, the range between the
+ * sector-aligned old size and the sector-aligned new size is locked, and
+ * for every vacant extent map in that range the existing extents are
+ * dropped and a file extent item is inserted (the zero arguments passed to
+ * btrfs_insert_file_extent() presumably describe a hole -- confirm against
+ * its definition).
+ *
+ * Returns 0 if there is nothing to do or on success, otherwise the error
+ * from btrfs_check_free_space(), btrfs_drop_extents() or
+ * btrfs_insert_file_extent().
+ */
+int btrfs_cont_expand(struct inode *inode, loff_t size)
+{
+        struct btrfs_trans_handle *trans;
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+        struct extent_map *em;
+        u64 mask = root->sectorsize - 1;
+        u64 hole_start = (inode->i_size + mask) & ~mask;
+        u64 block_end = (size + mask) & ~mask;
+        u64 last_byte;
+        u64 cur_offset;
+        u64 hole_size;
+        int err;
+        /* the new size still fits in the last sector we already have */
+        if (size <= hole_start)
+                return 0;
+        err = btrfs_check_free_space(root, 1, 0);
+        if (err)
+                return err;
+        btrfs_truncate_page(inode->i_mapping, inode->i_size);
+        /*
+         * lock the range, retrying until no ordered extent is found at
+         * hole_start
+         */
+        while (1) {
+                struct btrfs_ordered_extent *ordered;
+                btrfs_wait_ordered_range(inode, hole_start,
+                                         block_end - hole_start);
+                lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
+                ordered = btrfs_lookup_ordered_extent(inode, hole_start);
+                if (!ordered)
+                        break;
+                unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
+                btrfs_put_ordered_extent(ordered);
+        }
+        trans = btrfs_start_transaction(root, 1);
+        btrfs_set_trans_block_group(trans, inode);
+        cur_offset = hole_start;
+        while (1) {
+                em = btrfs_get_extent(inode, NULL, 0, cur_offset,
+                                block_end - cur_offset, 0);
+                BUG_ON(IS_ERR(em) || !em);
+                last_byte = min(extent_map_end(em), block_end);
+                last_byte = (last_byte + mask) & ~mask;
+                if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
+                        u64 hint_byte = 0;
+                        hole_size = last_byte - cur_offset;
+                        err = btrfs_drop_extents(trans, root, inode,
+                                                 cur_offset,
+                                                 cur_offset + hole_size,
+                                                 cur_offset, &hint_byte);
+                        if (err) {
+                                /* don't leak the extent map reference */
+                                free_extent_map(em);
+                                break;
+                        }
+                        err = btrfs_insert_file_extent(trans, root,
+                                        inode->i_ino, cur_offset, 0,
+                                        0, hole_size, 0, hole_size,
+                                        0, 0, 0);
+                        btrfs_drop_extent_cache(inode, hole_start,
+                                        last_byte - 1, 0);
+                }
+                free_extent_map(em);
+                cur_offset = last_byte;
+                if (err || cur_offset >= block_end)
+                        break;
+        }
+        btrfs_end_transaction(trans, root);
+        unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
+        return err;
+}
+/*
+ * setattr for btrfs inodes: validate the change, expand regular files that
+ * are being grown, apply the generic attribute change and, when the mode
+ * changed, update the ACLs to match.
+ */
+static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
+{
+        struct inode *inode = dentry->d_inode;
+        int ret = inode_change_ok(inode, attr);
+        if (ret)
+                return ret;
+        /* growing a regular file: set up the new range first */
+        if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE) &&
+            attr->ia_size > inode->i_size) {
+                ret = btrfs_cont_expand(inode, attr->ia_size);
+                if (ret)
+                        return ret;
+        }
+        ret = inode_setattr(inode, attr);
+        if (ret)
+                return ret;
+        if (attr->ia_valid & ATTR_MODE)
+                return btrfs_acl_chmod(inode);
+        return 0;
+}
+/*
+ * Final teardown of an inode: drop its page cache, remove all of its items
+ * from the tree, take it off the orphan list and clear it.
+ */
+void btrfs_delete_inode(struct inode *inode)
+{
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        struct btrfs_trans_handle *trans;
+        unsigned long dirtied;
+        int err;
+        truncate_inode_pages(&inode->i_data, 0);
+        /* bad inodes only need the orphan entry dropped */
+        if (is_bad_inode(inode)) {
+                btrfs_orphan_del(NULL, inode);
+                clear_inode(inode);
+                return;
+        }
+        btrfs_wait_ordered_range(inode, 0, (u64)-1);
+        btrfs_i_size_write(inode, 0);
+        trans = btrfs_join_transaction(root, 1);
+        btrfs_set_trans_block_group(trans, inode);
+        err = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, 0);
+        if (err) {
+                /* truncation failed: finish the transaction, then clear */
+                btrfs_orphan_del(NULL, inode);
+                dirtied = trans->blocks_used;
+                btrfs_end_transaction(trans, root);
+                btrfs_btree_balance_dirty(root, dirtied);
+                clear_inode(inode);
+                return;
+        }
+        btrfs_orphan_del(trans, inode);
+        dirtied = trans->blocks_used;
+        clear_inode(inode);
+        btrfs_end_transaction(trans, root);
+        btrfs_btree_balance_dirty(root, dirtied);
+}
+/*
+ * Look up dentry's name in directory 'dir' and copy the key stored in the
+ * matching dir item into *location.  When no entry is found, or the lookup
+ * fails, location->objectid is set to 0.
+ *
+ * Returns 0, or the error encoded in the lookup result.
+ */
+static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
+                               struct btrfs_key *location)
+{
+        struct btrfs_root *root = BTRFS_I(dir)->root;
+        const char *entry_name = dentry->d_name.name;
+        int entry_len = dentry->d_name.len;
+        struct btrfs_dir_item *found;
+        struct btrfs_path *path;
+        int err = 0;
+        path = btrfs_alloc_path();
+        BUG_ON(!path);
+        found = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino,
+                                      entry_name, entry_len, 0);
+        if (IS_ERR(found)) {
+                err = PTR_ERR(found);
+                location->objectid = 0;
+        } else if (!found) {
+                location->objectid = 0;
+        } else {
+                btrfs_dir_item_key_to_cpu(path->nodes[0], found, location);
+        }
+        btrfs_free_path(path);
+        return err;
+}
+/*
+ * A directory entry may point at another tree root instead of an inode.
+ * In that case read the root, hand it back through *sub_root and rewrite
+ * *location so it names the root directory inode of that tree.  This
+ * is kind of like crossing a mount point.
+ *
+ * Returns 0 when nothing had to be done or the fixup succeeded, otherwise
+ * the error from btrfs_read_fs_root().
+ */
+static int fixup_tree_root_location(struct btrfs_root *root,
+                             struct btrfs_key *location,
+                             struct btrfs_root **sub_root,
+                             struct dentry *dentry)
+{
+        struct btrfs_root *target;
+        /* only root items other than the tree of tree roots need a fixup */
+        if (btrfs_key_type(location) != BTRFS_ROOT_ITEM_KEY ||
+            location->objectid == BTRFS_ROOT_TREE_OBJECTID)
+                return 0;
+        target = btrfs_read_fs_root(root->fs_info, location,
+                                    dentry->d_name.name,
+                                    dentry->d_name.len);
+        *sub_root = target;
+        if (IS_ERR(target))
+                return PTR_ERR(target);
+        location->objectid = btrfs_root_dirid(&target->root_item);
+        btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
+        location->offset = 0;
+        return 0;
+}
+/*
+ * Reset the btrfs specific part of an inode: clear the ACL pointers and
+ * counters, and initialize the per-inode trees, list head and mutexes.
+ */
+static noinline void init_btrfs_i(struct inode *inode)
+{
+        struct btrfs_inode *bi = BTRFS_I(inode);
+        /* plain fields */
+        bi->i_acl = NULL;
+        bi->i_default_acl = NULL;
+        bi->flags = 0;
+        bi->generation = 0;
+        bi->sequence = 0;
+        bi->disk_i_size = 0;
+        bi->delalloc_bytes = 0;
+        bi->last_trans = 0;
+        bi->logged_trans = 0;
+        bi->log_dirty_trans = 0;
+        bi->index_cnt = (u64)-1;
+        /* trees, lists and locks */
+        extent_map_tree_init(&bi->extent_tree, GFP_NOFS);
+        extent_io_tree_init(&bi->io_tree, inode->i_mapping, GFP_NOFS);
+        extent_io_tree_init(&bi->io_failure_tree, inode->i_mapping,
+                            GFP_NOFS);
+        INIT_LIST_HEAD(&bi->delalloc_inodes);
+        btrfs_ordered_inode_tree_init(&bi->ordered_tree);
+        mutex_init(&bi->extent_mutex);
+        mutex_init(&bi->log_mutex);
+}
+/*
+ * Set callback handed to iget5_locked(): stamp the inode number and root
+ * taken from the btrfs_iget_args in 'p' onto a freshly created inode.
+ */
+static int btrfs_init_locked_inode(struct inode *inode, void *p)
+{
+        struct btrfs_iget_args *iget_args = p;
+        inode->i_ino = iget_args->ino;
+        init_btrfs_i(inode);
+        BTRFS_I(inode)->root = iget_args->root;
+        return 0;
+}
+/*
+ * Match callback for the inode hash lookups: an inode matches when both
+ * its inode number and its root equal the ones in the btrfs_iget_args.
+ */
+static int btrfs_find_actor(struct inode *inode, void *opaque)
+{
+        struct btrfs_iget_args *wanted = opaque;
+        if (wanted->ino != inode->i_ino)
+                return 0;
+        return wanted->root == BTRFS_I(inode)->root;
+}
+/*
+ * Look up an inode by (objectid, root) in the inode cache.  'wait' selects
+ * between ilookup5() and ilookup5_nowait().
+ */
+struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
+                            struct btrfs_root *root, int wait)
+{
+        struct btrfs_iget_args args;
+        args.root = root;
+        args.ino = objectid;
+        if (!wait)
+                return ilookup5_nowait(s, objectid, btrfs_find_actor,
+                                       (void *)&args);
+        return ilookup5(s, objectid, btrfs_find_actor, (void *)&args);
+}
+/*
+ * Find or create the in-memory inode for (objectid, root) via
+ * iget5_locked(), using the btrfs match and init callbacks.
+ */
+struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
+                                struct btrfs_root *root)
+{
+        struct btrfs_iget_args args;
+        args.root = root;
+        args.ino = objectid;
+        return iget5_locked(s, objectid, btrfs_find_actor,
+                            btrfs_init_locked_inode, (void *)&args);
+}
+/*
+ * Get an inode object given its location and corresponding root.
+ *
+ * If is_new is not NULL, *is_new is set to 1 when the inode was new and
+ * had to be read in here, and to 0 when it was already cached.
+ *
+ * Returns the inode, or ERR_PTR(-EACCES) when no inode could be obtained.
+ */
+struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
+                         struct btrfs_root *root, int *is_new)
+{
+        struct inode *inode = btrfs_iget_locked(s, location->objectid, root);
+        int fresh;
+        if (!inode)
+                return ERR_PTR(-EACCES);
+        /* sample I_NEW before unlock_new_inode() runs */
+        fresh = (inode->i_state & I_NEW) ? 1 : 0;
+        if (fresh) {
+                BTRFS_I(inode)->root = root;
+                memcpy(&BTRFS_I(inode)->location, location, sizeof(*location));
+                btrfs_read_locked_inode(inode);
+                unlock_new_inode(inode);
+        }
+        if (is_new)
+                *is_new = fresh;
+        return inode;
+}
+/*
+ * Resolve dentry's name inside directory 'dir' to an inode.
+ *
+ * Returns the inode, NULL when the name has no directory entry, or an
+ * ERR_PTR() on failure (-ENAMETOOLONG for names above BTRFS_NAME_LEN).
+ */
+struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
+{
+        struct btrfs_root *root = BTRFS_I(dir)->root;
+        struct btrfs_root *sub_root = root;
+        struct btrfs_key location;
+        int ret, new;
+        if (dentry->d_name.len > BTRFS_NAME_LEN)
+                return ERR_PTR(-ENAMETOOLONG);
+        ret = btrfs_inode_by_name(dir, dentry, &location);
+        if (ret < 0)
+                return ERR_PTR(ret);
+        /* objectid 0 means there is no such entry */
+        if (!location.objectid)
+                return NULL;
+        /* the entry may point into another tree root */
+        ret = fixup_tree_root_location(root, &location, &sub_root, dentry);
+        if (ret < 0)
+                return ERR_PTR(ret);
+        if (ret > 0)
+                return ERR_PTR(-ENOENT);
+        /* error pointers from btrfs_iget() are passed through unchanged */
+        return btrfs_iget(dir->i_sb, &location, sub_root, &new);
+}
+/*
+ * Lookup entry point: resolve the name to an inode and splice it into the
+ * dentry tree with d_splice_alias().  Errors come back as ERR_PTR() values.
+ */
+static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
+                                   struct nameidata *nd)
+{
+        struct inode *found;
+        if (dentry->d_name.len > BTRFS_NAME_LEN)
+                return ERR_PTR(-ENAMETOOLONG);
+        found = btrfs_lookup_dentry(dir, dentry);
+        if (!IS_ERR(found))
+                return d_splice_alias(found, dentry);
+        return ERR_CAST(found);
+}
+/*
+ * table of DT_* dirent type codes
+ * NOTE(review): presumably indexed by the type stored in a btrfs dir item;
+ * confirm against the lookup site.
+ */
+static unsigned char btrfs_filetype_table[] = {
+        DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
+};
+/*
+ * Emit the entries of the directory behind 'filp' through 'filldir'.
+ *
+ * f_pos 0 and 1 are used for "." and ".."; every other entry is found by
+ * walking the keys of this directory starting at key.offset == f_pos.
+ * The tree root is walked by BTRFS_DIR_ITEM_KEY, everything else by
+ * BTRFS_DIR_INDEX_KEY.
+ *
+ * Returns 0 when the walk finished or filldir asked to stop, -ENOMEM if an
+ * allocation failed, or the negative value returned by btrfs_search_slot().
+ */
+static int btrfs_real_readdir(struct file *filp, void *dirent,
+                              filldir_t filldir)
+{
+        struct inode *inode = filp->f_dentry->d_inode;
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        struct btrfs_item *item;
+        struct btrfs_dir_item *di;
+        struct btrfs_key key;
+        struct btrfs_key found_key;
+        struct btrfs_path *path;
+        int ret;
+        u32 nritems;
+        struct extent_buffer *leaf;
+        int slot;
+        int advance;
+        unsigned char d_type;
+        int over = 0;
+        u32 di_cur;
+        u32 di_total;
+        u32 di_len;
+        int key_type = BTRFS_DIR_INDEX_KEY;
+        char tmp_name[32];
+        char *name_ptr;
+        int name_len;
+        /* FIXME, use a real flag for deciding about the key type */
+        if (root->fs_info->tree_root == root)
+                key_type = BTRFS_DIR_ITEM_KEY;
+        /* special case for "." */
+        if (filp->f_pos == 0) {
+                over = filldir(dirent, ".", 1,
+                               1, inode->i_ino,
+                               DT_DIR);
+                if (over)
+                        return 0;
+                filp->f_pos = 1;
+        }
+        /* special case for .., just use the back ref */
+        if (filp->f_pos == 1) {
+                u64 pino = parent_ino(filp->f_path.dentry);
+                over = filldir(dirent, "..", 2,
+                               2, pino, DT_DIR);
+                if (over)
+                        return 0;
+                filp->f_pos = 2;
+        }
+        path = btrfs_alloc_path();
+        /* the path is dereferenced right below, so fail cleanly without it */
+        if (!path)
+                return -ENOMEM;
+        path->reada = 2;
+        btrfs_set_key_type(&key, key_type);
+        key.offset = filp->f_pos;
+        key.objectid = inode->i_ino;
+        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+        if (ret < 0)
+                goto err;
+        advance = 0;
+        while (1) {
+                leaf = path->nodes[0];
+                nritems = btrfs_header_nritems(leaf);
+                slot = path->slots[0];
+                /* step to the next item, crossing into the next leaf if needed */
+                if (advance || slot >= nritems) {
+                        if (slot >= nritems - 1) {
+                                /* any non-zero result ends the walk */
+                                ret = btrfs_next_leaf(root, path);
+                                if (ret)
+                                        break;
+                                leaf = path->nodes[0];
+                                nritems = btrfs_header_nritems(leaf);
+                                slot = path->slots[0];
+                        } else {
+                                slot++;
+                                path->slots[0]++;
+                        }
+                }
+                advance = 1;
+                item = btrfs_item_nr(leaf, slot);
+                btrfs_item_key_to_cpu(leaf, &found_key, slot);
+                /* stop once the keys no longer belong to this directory */
+                if (found_key.objectid != key.objectid)
+                        break;
+                if (btrfs_key_type(&found_key) != key_type)
+                        break;
+                if (found_key.offset < filp->f_pos)
+                        continue;
+                filp->f_pos = found_key.offset;
+                di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
+                di_cur = 0;
+                di_total = btrfs_item_size(leaf, item);
+                /* one item may pack several dir entries back to back */
+                while (di_cur < di_total) {
+                        struct btrfs_key location;
+                        name_len = btrfs_dir_name_len(leaf, di);
+                        /* short names use the stack buffer, long ones the heap */
+                        if (name_len <= sizeof(tmp_name)) {
+                                name_ptr = tmp_name;
+                        } else {
+                                name_ptr = kmalloc(name_len, GFP_NOFS);
+                                if (!name_ptr) {
+                                        ret = -ENOMEM;
+                                        goto err;
+                                }
+                        }
+                        /* the name bytes follow the dir item header */
+                        read_extent_buffer(leaf, name_ptr,
+                                           (unsigned long)(di + 1), name_len);
+                        d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
+                        btrfs_dir_item_key_to_cpu(leaf, di, &location);
+                        /* is this a reference to our own snapshot? If so
+                         * skip it
+                         */
+                        if (location.type == BTRFS_ROOT_ITEM_KEY &&
+                            location.objectid == root->root_key.objectid) {
+                                over = 0;
+                                goto skip;
+                        }
+                        over = filldir(dirent, name_ptr, name_len,
+                                       found_key.offset, location.objectid,
+                                       d_type);
+skip:
+                        if (name_ptr != tmp_name)
+                                kfree(name_ptr);
+                        /* filldir is full: keep f_pos at this entry */
+                        if (over)
+                                goto nopos;
+                        di_len = btrfs_dir_name_len(leaf, di) +
+                                 btrfs_dir_data_len(leaf, di) + sizeof(*di);
+                        di_cur += di_len;
+                        di = (struct btrfs_dir_item *)((char *)di + di_len);
+                }
+        }
+        /* Reached end of directory/root. Bump pos past the last item. */
+        if (key_type == BTRFS_DIR_INDEX_KEY)
+                filp->f_pos = INT_LIMIT(typeof(filp->f_pos));
+        else
+                filp->f_pos++;
+nopos:
+        ret = 0;
+err:
+        btrfs_free_path(path);
+        return ret;
+}
+/*
+ * Flush 'inode' when the caller asks to wait: join the transaction of the
+ * inode's root and commit it, returning the commit result.  The btree
+ * inode and non-waiting calls return 0 without doing anything.
+ */
+int btrfs_write_inode(struct inode *inode, int wait)
+{
+        struct btrfs_trans_handle *trans;
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        if (root->fs_info->btree_inode == inode || !wait)
+                return 0;
+        trans = btrfs_join_transaction(root, 1);
+        btrfs_set_trans_block_group(trans, inode);
+        return btrfs_commit_transaction(trans, root);
+}
+/*
+ * Push the in-memory inode into the tree every time it is dirtied.  That
+ * is somewhat expensive, although the inode is most likely found in cache.
+ * FIXME: needs more benchmarking; performance is the only argument for
+ * keeping or dropping this code.
+ */
+void btrfs_dirty_inode(struct inode *inode)
+{
+        struct btrfs_trans_handle *trans;
+        struct btrfs_root *root;
+        root = BTRFS_I(inode)->root;
+        /* join the transaction, write the inode item, drop the handle */
+        trans = btrfs_join_transaction(root, 1);
+        btrfs_set_trans_block_group(trans, inode);
+        btrfs_update_inode(trans, root, inode);
+        btrfs_end_transaction(trans, root);
+}
+/*
+ * Look up the last BTRFS_DIR_INDEX_KEY item of a directory and prime the
+ * in-memory index_cnt with the sequence number that follows it.
+ *
+ * Returns 0 on success, -ENOMEM if no path could be allocated, or the
+ * negative value from btrfs_search_slot().
+ */
+static int btrfs_set_inode_index_count(struct inode *inode)
+{
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        struct btrfs_path *path;
+        struct btrfs_key key;
+        struct btrfs_key found_key;
+        /*
+         * MAGIC NUMBER EXPLANATION:
+         * directories are searched by f_pos, and '.' and '..' take f_pos
+         * 0 and 1, so every other entry has to start at 2
+         */
+        u64 next_index = 2;
+        int ret;
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+        /* search for the largest possible index key of this directory */
+        key.objectid = inode->i_ino;
+        btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
+        key.offset = (u64)-1;
+        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+        /*
+         * ret < 0 is an error and is returned as is.  ret == 0 (presumably
+         * an exact match on the key) returns 0 with index_cnt untouched.
+         * FIXME: we should be able to handle this
+         */
+        if (ret <= 0)
+                goto out;
+        ret = 0;
+        /* step back one slot, if there is one, and check whose item it is */
+        if (path->slots[0] != 0) {
+                path->slots[0]--;
+                btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+                                      path->slots[0]);
+                if (found_key.objectid == inode->i_ino &&
+                    btrfs_key_type(&found_key) == BTRFS_DIR_INDEX_KEY)
+                        next_index = found_key.offset + 1;
+        }
+        BTRFS_I(inode)->index_cnt = next_index;
+out:
+        btrfs_free_path(path);
+        return ret;
+}
+/*
+ * Hand out the next free sequence number of directory 'dir' through
+ * '*index'.  The scheme is very simple for now; later versions will do
+ * smarter things in the btree.
+ */
+int btrfs_set_inode_index(struct inode *dir, u64 *index)
+{
+        /* an index_cnt of (u64)-1 is loaded from the tree before use */
+        if (BTRFS_I(dir)->index_cnt == (u64)-1) {
+                int err = btrfs_set_inode_index_count(dir);
+                if (err)
+                        return err;
+        }
+        *index = BTRFS_I(dir)->index_cnt;
+        BTRFS_I(dir)->index_cnt += 1;
+        return 0;
+}
+/*
+ * Allocate a new in-memory inode and insert its inode item and inode ref
+ * into 'root' under 'objectid'.
+ *
+ * When 'dir' is non-NULL a directory sequence number is reserved in it and
+ * returned through '*index'.  '*index' is read for the inode ref in every
+ * case, so 'index' must always point at valid storage.  'name'/'name_len'
+ * is the name stored in the ref, 'ref_objectid' becomes the ref key
+ * offset, and 'alloc_hint' is passed to btrfs_find_block_group().
+ *
+ * Returns the hashed inode, or an ERR_PTR() on failure.
+ */
+static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
+                                     struct btrfs_root *root,
+                                     struct inode *dir,
+                                     const char *name, int name_len,
+                                     u64 ref_objectid, u64 objectid,
+                                     u64 alloc_hint, int mode, u64 *index)
+{
+        struct inode *inode;
+        struct btrfs_inode_item *inode_item;
+        struct btrfs_key *location;
+        struct btrfs_path *path;
+        struct btrfs_inode_ref *ref;
+        struct btrfs_key key[2];
+        u32 sizes[2];
+        unsigned long ptr;
+        int ret;
+        int owner;
+        path = btrfs_alloc_path();
+        BUG_ON(!path);
+        inode = new_inode(root->fs_info->sb);
+        if (!inode) {
+                /* do not leak the path allocated above */
+                btrfs_free_path(path);
+                return ERR_PTR(-ENOMEM);
+        }
+        if (dir) {
+                ret = btrfs_set_inode_index(dir, index);
+                if (ret) {
+                        /*
+                         * NOTE(review): the inode from new_inode() is not
+                         * released here; confirm whether an iput() is safe
+                         * before init_btrfs_i() has run.
+                         */
+                        btrfs_free_path(path);
+                        return ERR_PTR(ret);
+                }
+        }
+        /*
+         * index_cnt is ignored for everything but a dir,
+         * btrfs_set_inode_index_count has an explanation for the magic
+         * number
+         */
+        init_btrfs_i(inode);
+        BTRFS_I(inode)->index_cnt = 2;
+        BTRFS_I(inode)->root = root;
+        BTRFS_I(inode)->generation = trans->transid;
+        if (mode & S_IFDIR)
+                owner = 0;
+        else
+                owner = 1;
+        BTRFS_I(inode)->block_group =
+                        btrfs_find_block_group(root, 0, alloc_hint, owner);
+        /* regular files inherit the nodatasum/nodatacow mount options */
+        if ((mode & S_IFREG)) {
+                if (btrfs_test_opt(root, NODATASUM))
+                        btrfs_set_flag(inode, NODATASUM);
+                if (btrfs_test_opt(root, NODATACOW))
+                        btrfs_set_flag(inode, NODATACOW);
+        }
+        /* insert the inode item and the inode ref in one call */
+        key[0].objectid = objectid;
+        btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
+        key[0].offset = 0;
+        key[1].objectid = objectid;
+        btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
+        key[1].offset = ref_objectid;
+        sizes[0] = sizeof(struct btrfs_inode_item);
+        sizes[1] = name_len + sizeof(*ref);
+        ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2);
+        if (ret != 0)
+                goto fail;
+        if (objectid > root->highest_inode)
+                root->highest_inode = objectid;
+        inode->i_uid = current_fsuid();
+        inode->i_gid = current_fsgid();
+        inode->i_mode = mode;
+        inode->i_ino = objectid;
+        inode_set_bytes(inode, 0);
+        inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+        inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+                                  struct btrfs_inode_item);
+        fill_inode_item(trans, path->nodes[0], inode_item, inode);
+        /* the ref item sits in the slot right after the inode item */
+        ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
+                             struct btrfs_inode_ref);
+        btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
+        btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
+        /* the name is stored directly behind the ref header */
+        ptr = (unsigned long)(ref + 1);
+        write_extent_buffer(path->nodes[0], name, ptr, name_len);
+        btrfs_mark_buffer_dirty(path->nodes[0]);
+        btrfs_free_path(path);
+        location = &BTRFS_I(inode)->location;
+        location->objectid = objectid;
+        location->offset = 0;
+        btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
+        insert_inode_hash(inode);
+        return inode;
+fail:
+        /* give back the sequence number reserved in 'dir' */
+        if (dir)
+                BTRFS_I(dir)->index_cnt--;
+        /* NOTE(review): 'inode' is not released on this path either */
+        btrfs_free_path(path);
+        return ERR_PTR(ret);
+}
+/* Translate the S_IFMT bits of inode->i_mode via btrfs_type_by_mode[]. */
+static inline u8 btrfs_inode_type(struct inode *inode)
+{
+        unsigned int fmt_bits = inode->i_mode & S_IFMT;
+        return btrfs_type_by_mode[fmt_bits >> S_SHIFT];
+}
+/*
+ * utility function to add 'inode' into 'parent_inode' with
+ * a given name and a given sequence number.
+ * if 'add_backref' is true, also insert a backref from the
+ * inode to the parent directory.
+ *
+ * Once the dir item is in, the parent's i_size grows by name_len * 2 and
+ * its mtime/ctime are refreshed and written back.
+ *
+ * Returns 0 on success.  A failure to insert the dir item is returned
+ * right away; otherwise a backref insertion error takes precedence over
+ * the result of updating the parent inode.
+ */
+int btrfs_add_link(struct btrfs_trans_handle *trans,
+                   struct inode *parent_inode, struct inode *inode,
+                   const char *name, int name_len, int add_backref, u64 index)
+{
+        int ret;
+        int backref_err = 0;
+        struct btrfs_key key;
+        struct btrfs_root *root = BTRFS_I(parent_inode)->root;
+        /* the dir item points at the inode item of 'inode' */
+        key.objectid = inode->i_ino;
+        btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+        key.offset = 0;
+        ret = btrfs_insert_dir_item(trans, root, name, name_len,
+                                    parent_inode->i_ino,
+                                    &key, btrfs_inode_type(inode),
+                                    index);
+        if (ret)
+                return ret;
+        if (add_backref) {
+                backref_err = btrfs_insert_inode_ref(trans, root,
+                                                     name, name_len,
+                                                     inode->i_ino,
+                                                     parent_inode->i_ino,
+                                                     index);
+        }
+        btrfs_i_size_write(parent_inode, parent_inode->i_size +
+                           name_len * 2);
+        parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
+        ret = btrfs_update_inode(trans, root, parent_inode);
+        /* a successful parent update must not hide a failed backref insert */
+        if (backref_err)
+                return backref_err;
+        return ret;
+}
+/*
+ * Link 'inode' under the parent of 'dentry' using the dentry's name and
+ * instantiate the dentry on success.  A positive result from
+ * btrfs_add_link() is reported as -EEXIST; negative results are passed on.
+ */
+static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
+                            struct dentry *dentry, struct inode *inode,
+                            int backref, u64 index)
+{
+        int err;
+        err = btrfs_add_link(trans, dentry->d_parent->d_inode,
+                             inode, dentry->d_name.name,
+                             dentry->d_name.len, backref, index);
+        if (err > 0)
+                return -EEXIST;
+        if (err < 0)
+                return err;
+        d_instantiate(dentry, inode);
+        return 0;
+}
+/*
+ * Create the special inode described by 'mode' and 'rdev' for 'dentry' in
+ * directory 'dir'.
+ *
+ * Returns 0 on success, -EINVAL if 'rdev' fails new_valid_dev(), -ENOSPC
+ * if no free objectid was found, or the error of the step that failed.
+ */
+static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
+                        int mode, dev_t rdev)
+{
+        struct btrfs_root *root = BTRFS_I(dir)->root;
+        struct btrfs_trans_handle *trans;
+        struct inode *inode = NULL;
+        unsigned long nr = 0;
+        u64 objectid;
+        u64 index = 0;
+        int drop_inode = 0;
+        int err;
+        if (!new_valid_dev(rdev))
+                return -EINVAL;
+        err = btrfs_check_free_space(root, 1, 0);
+        if (err)
+                goto out;
+        trans = btrfs_start_transaction(root, 1);
+        btrfs_set_trans_block_group(trans, dir);
+        /* every objectid lookup failure is reported as -ENOSPC */
+        if (btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid)) {
+                err = -ENOSPC;
+                goto out_end_trans;
+        }
+        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
+                                dentry->d_name.len,
+                                dentry->d_parent->d_inode->i_ino, objectid,
+                                BTRFS_I(dir)->block_group, mode, &index);
+        if (IS_ERR(inode)) {
+                err = PTR_ERR(inode);
+                goto out_end_trans;
+        }
+        err = btrfs_init_acl(inode, dir);
+        if (err) {
+                drop_inode = 1;
+                goto out_end_trans;
+        }
+        btrfs_set_trans_block_group(trans, inode);
+        err = btrfs_add_nondir(trans, dentry, inode, 0, index);
+        if (!err) {
+                /* linked in: turn it into a special inode and write it */
+                inode->i_op = &btrfs_special_inode_operations;
+                init_special_inode(inode, inode->i_mode, rdev);
+                btrfs_update_inode(trans, root, inode);
+        } else {
+                drop_inode = 1;
+        }
+        dir->i_sb->s_dirt = 1;
+        btrfs_update_inode_block_group(trans, inode);
+        btrfs_update_inode_block_group(trans, dir);
+out_end_trans:
+        nr = trans->blocks_used;
+        btrfs_end_transaction_throttle(trans, root);
+out:
+        /* undo the half-created inode after the transaction handle is gone */
+        if (drop_inode) {
+                inode_dec_link_count(inode);
+                iput(inode);
+        }
+        btrfs_btree_balance_dirty(root, nr);
+        return err;
+}
+/*
+ * Create a new inode with 'mode' for 'dentry' in directory 'dir' and wire
+ * up the btrfs file, address space and extent io operations on it.  'nd'
+ * is not used here.
+ *
+ * Returns 0 on success, -ENOSPC if no free objectid was found, or the
+ * error of the step that failed.
+ */
+static int btrfs_create(struct inode *dir, struct dentry *dentry,
+                        int mode, struct nameidata *nd)
+{
+        struct btrfs_root *root = BTRFS_I(dir)->root;
+        struct btrfs_trans_handle *trans;
+        struct inode *inode = NULL;
+        unsigned long nr = 0;
+        u64 objectid;
+        u64 index = 0;
+        int drop_inode = 0;
+        int err;
+        err = btrfs_check_free_space(root, 1, 0);
+        if (err)
+                goto out;
+        trans = btrfs_start_transaction(root, 1);
+        btrfs_set_trans_block_group(trans, dir);
+        /* every objectid lookup failure is reported as -ENOSPC */
+        if (btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid)) {
+                err = -ENOSPC;
+                goto out_end_trans;
+        }
+        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
+                                dentry->d_name.len,
+                                dentry->d_parent->d_inode->i_ino,
+                                objectid, BTRFS_I(dir)->block_group, mode,
+                                &index);
+        if (IS_ERR(inode)) {
+                err = PTR_ERR(inode);
+                goto out_end_trans;
+        }
+        err = btrfs_init_acl(inode, dir);
+        if (err) {
+                drop_inode = 1;
+                goto out_end_trans;
+        }
+        btrfs_set_trans_block_group(trans, inode);
+        err = btrfs_add_nondir(trans, dentry, inode, 0, index);
+        if (!err) {
+                /* linked in: install the regular file operations */
+                inode->i_mapping->a_ops = &btrfs_aops;
+                inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
+                inode->i_fop = &btrfs_file_operations;
+                inode->i_op = &btrfs_file_inode_operations;
+                BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+        } else {
+                drop_inode = 1;
+        }
+        dir->i_sb->s_dirt = 1;
+        btrfs_update_inode_block_group(trans, inode);
+        btrfs_update_inode_block_group(trans, dir);
+out_end_trans:
+        nr = trans->blocks_used;
+        btrfs_end_transaction_throttle(trans, root);
+out:
+        /* undo the half-created inode after the transaction handle is gone */
+        if (drop_inode) {
+                inode_dec_link_count(inode);
+                iput(inode);
+        }
+        btrfs_btree_balance_dirty(root, nr);
+        return err;
+}
+/*
+ * Add a hard link named by 'dentry' in directory 'dir' to the inode behind
+ * 'old_dentry'.
+ *
+ * Returns 0 on success, -ENOENT if the inode has no links left, or the
+ * error of the step that failed.  The link count and the extra inode
+ * reference are only taken once the early checks have passed, and are
+ * dropped again if a later step fails.
+ */
+static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
+                      struct dentry *dentry)
+{
+        struct btrfs_trans_handle *trans;
+        struct btrfs_root *root = BTRFS_I(dir)->root;
+        struct inode *inode = old_dentry->d_inode;
+        u64 index;
+        unsigned long nr = 0;
+        int err;
+        int drop_inode = 0;
+        if (inode->i_nlink == 0)
+                return -ENOENT;
+        err = btrfs_check_free_space(root, 1, 0);
+        if (err)
+                goto fail;
+        err = btrfs_set_inode_index(dir, &index);
+        if (err)
+                goto fail;
+        /*
+         * Bump the link count only now: the two exits above go to 'fail'
+         * with drop_inode == 0 and would otherwise leave it inflated.
+         */
+        btrfs_inc_nlink(inode);
+        trans = btrfs_start_transaction(root, 1);
+        btrfs_set_trans_block_group(trans, dir);
+        atomic_inc(&inode->i_count);
+        err = btrfs_add_nondir(trans, dentry, inode, 1, index);
+        if (err)
+                drop_inode = 1;
+        dir->i_sb->s_dirt = 1;
+        btrfs_update_inode_block_group(trans, dir);
+        /*
+         * Write the inode only when the link went in, so a failed
+         * btrfs_add_nondir() is not masked by a successful update.
+         */
+        if (!err) {
+                err = btrfs_update_inode(trans, root, inode);
+                if (err)
+                        drop_inode = 1;
+        }
+        nr = trans->blocks_used;
+        btrfs_end_transaction_throttle(trans, root);
+fail:
+        /* undo the link count bump and the i_count reference taken above */
+        if (drop_inode) {
+                inode_dec_link_count(inode);
+                iput(inode);
+        }
+        btrfs_btree_balance_dirty(root, nr);
+        return err;
+}
+/*
+ * Create the directory named by 'dentry' in 'dir' with permissions 'mode'.
+ *
+ * Returns 0 on success, -ENOSPC if no free objectid was found, or the
+ * error of the step that failed.  Every exit taken after the transaction
+ * was started goes through 'out_fail' so the handle is always ended.
+ */
+static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+        struct inode *inode = NULL;
+        struct btrfs_trans_handle *trans;
+        struct btrfs_root *root = BTRFS_I(dir)->root;
+        int err = 0;
+        int drop_on_err = 0;
+        u64 objectid = 0;
+        u64 index = 0;
+        unsigned long nr = 1;
+        err = btrfs_check_free_space(root, 1, 0);
+        if (err)
+                goto out_unlock;
+        trans = btrfs_start_transaction(root, 1);
+        /* check the handle before anything touches it */
+        if (IS_ERR(trans)) {
+                err = PTR_ERR(trans);
+                goto out_unlock;
+        }
+        btrfs_set_trans_block_group(trans, dir);
+        err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
+        if (err) {
+                err = -ENOSPC;
+                /* the transaction is running, so it has to be ended */
+                goto out_fail;
+        }
+        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
+                                dentry->d_name.len,
+                                dentry->d_parent->d_inode->i_ino, objectid,
+                                BTRFS_I(dir)->block_group, S_IFDIR | mode,
+                                &index);
+        if (IS_ERR(inode)) {
+                err = PTR_ERR(inode);
+                goto out_fail;
+        }
+        /* from here on a failure has to drop the new inode */
+        drop_on_err = 1;
+        err = btrfs_init_acl(inode, dir);
+        if (err)
+                goto out_fail;
+        inode->i_op = &btrfs_dir_inode_operations;
+        inode->i_fop = &btrfs_dir_file_operations;
+        btrfs_set_trans_block_group(trans, inode);
+        btrfs_i_size_write(inode, 0);
+        err = btrfs_update_inode(trans, root, inode);
+        if (err)
+                goto out_fail;
+        err = btrfs_add_link(trans, dentry->d_parent->d_inode,
+                                 inode, dentry->d_name.name,
+                                 dentry->d_name.len, 0, index);
+        if (err)
+                goto out_fail;
+        d_instantiate(dentry, inode);
+        drop_on_err = 0;
+        dir->i_sb->s_dirt = 1;
+        btrfs_update_inode_block_group(trans, inode);
+        btrfs_update_inode_block_group(trans, dir);
+out_fail:
+        nr = trans->blocks_used;
+        btrfs_end_transaction_throttle(trans, root);
+out_unlock:
+        if (drop_on_err)
+                iput(inode);
+        btrfs_btree_balance_dirty(root, nr);
+        return err;
+}
+/*
+ * Helper for btrfs_get_extent.  Trim 'em' so that it begins at 'map_start'
+ * and spans 'map_len' bytes, then add it to 'em_tree'.  The block range is
+ * shifted by the same amount unless block_start is at or above
+ * EXTENT_MAP_LAST_BYTE or the map is flagged compressed.
+ *
+ * 'map_start' must lie inside the original range of 'em' (BUG otherwise).
+ * 'existing' is not used here.  Returns the result of add_extent_mapping().
+ */
+static int merge_extent_mapping(struct extent_map_tree *em_tree,
+                                struct extent_map *existing,
+                                struct extent_map *em,
+                                u64 map_start, u64 map_len)
+{
+        u64 trimmed;
+        BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
+        /* number of bytes cut off the front of the mapping */
+        trimmed = map_start - em->start;
+        em->start = map_start;
+        em->len = map_len;
+        if (em->block_start >= EXTENT_MAP_LAST_BYTE ||
+            test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+                return add_extent_mapping(em_tree, em);
+        em->block_start += trimmed;
+        em->block_len -= trimmed;
+        return add_extent_mapping(em_tree, em);
+}
+/*
+ * Decompress the inline extent 'item' found at path->slots[0] into 'page'.
+ *
+ * The compressed bytes are first copied out of the leaf into a temporary
+ * buffer.  If btrfs_zlib_decompress() fails, the page range is zero-filled
+ * instead and the function still returns 0.
+ *
+ * Returns 0, or -ENOMEM if the temporary buffer cannot be allocated.
+ */
+static noinline int uncompress_inline(struct btrfs_path *path,
+                                      struct inode *inode, struct page *page,
+                                      size_t pg_offset, u64 extent_offset,
+                                      struct btrfs_file_extent_item *item)
+{
+        int ret;
+        struct extent_buffer *leaf = path->nodes[0];
+        char *tmp;
+        size_t max_size;
+        unsigned long inline_size;
+        unsigned long ptr;
+        WARN_ON(pg_offset != 0);
+        max_size = btrfs_file_extent_ram_bytes(leaf, item);
+        inline_size = btrfs_file_extent_inline_item_len(leaf,
+                                        btrfs_item_nr(leaf, path->slots[0]));
+        tmp = kmalloc(inline_size, GFP_NOFS);
+        /* read_extent_buffer() below writes into tmp, so it must exist */
+        if (!tmp)
+                return -ENOMEM;
+        ptr = btrfs_file_extent_inline_start(item);
+        read_extent_buffer(leaf, tmp, ptr, inline_size);
+        /* never decompress more than one page worth of data */
+        max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
+        ret = btrfs_zlib_decompress(tmp, page, extent_offset,
+                                    inline_size, max_size);
+        if (ret) {
+                /* decompression failed: hand back zeroes instead */
+                char *kaddr = kmap_atomic(page, KM_USER0);
+                unsigned long copy_size = min_t(u64,
+                                  PAGE_CACHE_SIZE - pg_offset,
+                                  max_size - extent_offset);
+                memset(kaddr + pg_offset, 0, copy_size);
+                kunmap_atomic(kaddr, KM_USER0);
+        }
+        kfree(tmp);
+        return 0;
+}
+/*
+ * a bit scary, this does extent mapping from logical file offset to the disk.
+ * the ugly parts come from merging extents from the disk with the in-ram
+ * representation.  This gets more complex because of the data=ordered code,
+ * where the in-ram extents might be locked pending data=ordered completion.
+ *
+ * This also copies inline extents directly into the page.
+ */
+struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
+                                    size_t pg_offset, u64 start, u64 len,
+                                    int create)
+{
+        int ret;
+        int err = 0;
+        u64 bytenr;
+        u64 extent_start = 0;
+        u64 extent_end = 0;
+        u64 objectid = inode->i_ino;
+        u32 found_type;
+        struct btrfs_path *path = NULL;
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        struct btrfs_file_extent_item *item;
+        struct extent_buffer *leaf;
+        struct btrfs_key found_key;
+        struct extent_map *em = NULL;
+        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+        struct btrfs_trans_handle *trans = NULL;
+        int compressed;
+again:
+        spin_lock(&em_tree->lock);
+        em = lookup_extent_mapping(em_tree, start, len);
+        if (em)
+                em->bdev = root->fs_info->fs_devices->latest_bdev;
+        spin_unlock(&em_tree->lock);
+        if (em) {
+                if (em->start > start || em->start + em->len <= start)
+                        free_extent_map(em);
+                else if (em->block_start == EXTENT_MAP_INLINE && page)
+                        free_extent_map(em);
+                else
+                        goto out;
+        }
+        em = alloc_extent_map(GFP_NOFS);
+        if (!em) {
+                err = -ENOMEM;
+                goto out;
+        }
+        em->bdev = root->fs_info->fs_devices->latest_bdev;
+        em->start = EXTENT_MAP_HOLE;
+        em->orig_start = EXTENT_MAP_HOLE;
+        em->len = (u64)-1;
+        em->block_len = (u64)-1;
+        if (!path) {
+                path = btrfs_alloc_path();
+                BUG_ON(!path);
+        }
+        ret = btrfs_lookup_file_extent(trans, root, path,
+                                       objectid, start, trans != NULL);
+        if (ret < 0) {
+                err = ret;
+                goto out;
+        }
+        if (ret != 0) {
+                if (path->slots[0] == 0)
+                        goto not_found;
+                path->slots[0]--;
+        }
+        leaf = path->nodes[0];
+        item = btrfs_item_ptr(leaf, path->slots[0],
+                              struct btrfs_file_extent_item);
+        /* are we inside the extent that was found? */
+        btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+        found_type = btrfs_key_type(&found_key);
+        if (found_key.objectid != objectid ||
+            found_type != BTRFS_EXTENT_DATA_KEY) {
+                goto not_found;
+        }
+        found_type = btrfs_file_extent_type(leaf, item);
+        extent_start = found_key.offset;
+        compressed = btrfs_file_extent_compression(leaf, item);
+        if (found_type == BTRFS_FILE_EXTENT_REG ||
+            found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+                extent_end = extent_start +
+                       btrfs_file_extent_num_bytes(leaf, item);
+        } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+                size_t size;
+                size = btrfs_file_extent_inline_len(leaf, item);
+                extent_end = (extent_start + size + root->sectorsize - 1) &
+                        ~((u64)root->sectorsize - 1);
+        }
+        if (start >= extent_end) {
+                path->slots[0]++;
+                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+                        ret = btrfs_next_leaf(root, path);
+                        if (ret < 0) {
+                                err = ret;
+                                goto out;
+                        }
+                        if (ret > 0)
+                                goto not_found;
+                        leaf = path->nodes[0];
+                }
+                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+                if (found_key.objectid != objectid ||
+                    found_key.type != BTRFS_EXTENT_DATA_KEY)
+                        goto not_found;
+                if (start + len <= found_key.offset)
+                        goto not_found;
+                em->start = start;
+                em->len = found_key.offset - start;
+                goto not_found_em;
+        }
+        if (found_type == BTRFS_FILE_EXTENT_REG ||
+            found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+                em->start = extent_start;
+                em->len = extent_end - extent_start;
+                em->orig_start = extent_start -
+                                 btrfs_file_extent_offset(leaf, item);
+                bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
+                if (bytenr == 0) {
+                        em->block_start = EXTENT_MAP_HOLE;
+                        goto insert;
+                }
+                if (compressed) {
+                        set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+                        em->block_start = bytenr;
+                        em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
+                                                                         item);
+                } else {
+                        bytenr += btrfs_file_extent_offset(leaf, item);
+                        em->block_start = bytenr;
+                        em->block_len = em->len;
+                        if (found_type == BTRFS_FILE_EXTENT_PREALLOC)
+                                set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+                }
+                goto insert;
+        } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+                unsigned long ptr;
+                char *map;
+                size_t size;
+                size_t extent_offset;
+                size_t copy_size;
+                em->block_start = EXTENT_MAP_INLINE;
+                if (!page || create) {
+                        em->start = extent_start;
+                        em->len = extent_end - extent_start;
+                        goto out;
+                }
+                size = btrfs_file_extent_inline_len(leaf, item);
+                extent_offset = page_offset(page) + pg_offset - extent_start;
+                copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,
+                                size - extent_offset);
+                em->start = extent_start + extent_offset;
+                em->len = (copy_size + root->sectorsize - 1) &
+                        ~((u64)root->sectorsize - 1);
+                em->orig_start = EXTENT_MAP_INLINE;
+                if (compressed)
+                        set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+                ptr = btrfs_file_extent_inline_start(item) + extent_offset;
+                if (create == 0 && !PageUptodate(page)) {
+                        if (btrfs_file_extent_compression(leaf, item) ==
+                            BTRFS_COMPRESS_ZLIB) {
+                                ret = uncompress_inline(path, inode, page,
+                                                        pg_offset,
+                                                        extent_offset, item);
+                                BUG_ON(ret);
+                        } else {
+                                map = kmap(page);
+                                read_extent_buffer(leaf, map + pg_offset, ptr,
+                                                   copy_size);
+                                kunmap(page);
+                        }
+                        flush_dcache_page(page);
+                } else if (create && PageUptodate(page)) {
+                        if (!trans) {
+                                kunmap(page);
+                                free_extent_map(em);
+                                em = NULL;
+                                btrfs_release_path(root, path);
+                                trans = btrfs_join_transaction(root, 1);
+                                goto again;
+                        }
+                        map = kmap(page);
+                        write_extent_buffer(leaf, map + pg_offset, ptr,
+                                            copy_size);
+                        kunmap(page);
+                        btrfs_mark_buffer_dirty(leaf);
+                }
+                set_extent_uptodate(io_tree, em->start,
+                                    extent_map_end(em) - 1, GFP_NOFS);
+                goto insert;
+        } else {
+                printk(KERN_ERR "btrfs unknown found_type %d\n", found_type);
+                WARN_ON(1);
+        }
+not_found:
+        em->start = start;
+        em->len = len;
+not_found_em:
+        em->block_start = EXTENT_MAP_HOLE;
+        set_bit(EXTENT_FLAG_VACANCY, &em->flags);
+insert:
+        btrfs_release_path(root, path);
+        if (em->start > start || extent_map_end(em) <= start) {
+                printk(KERN_ERR "Btrfs: bad extent! em: [%llu %llu] passed "
+                       "[%llu %llu]\n", (unsigned long long)em->start,
+                       (unsigned long long)em->len,
+                       (unsigned long long)start,
+                       (unsigned long long)len);
+                err = -EIO;
+                goto out;
+        }
+        err = 0;
+        spin_lock(&em_tree->lock);
+        ret = add_extent_mapping(em_tree, em);
+        /* it is possible that someone inserted the extent into the tree
+         * while we had the lock dropped.  It is also possible that
+         * an overlapping map exists in the tree
+         */
+        if (ret == -EEXIST) {
+                struct extent_map *existing;
+                ret = 0;
+                existing = lookup_extent_mapping(em_tree, start, len);
+                if (existing && (existing->start > start ||
+                    existing->start + existing->len <= start)) {
+                        free_extent_map(existing);
+                        existing = NULL;
+                }
+                if (!existing) {
+                        existing = lookup_extent_mapping(em_tree, em->start,
+                                                         em->len);
+                        if (existing) {
+                                err = merge_extent_mapping(em_tree, existing,
+                                                           em, start,
+                                                           root->sectorsize);
+                                free_extent_map(existing);
+                                if (err) {
+                                        free_extent_map(em);
+                                        em = NULL;
+                                }
+                        } else {
+                                err = -EIO;
+                                free_extent_map(em);
+                                em = NULL;
+                        }
+                } else {
+                        free_extent_map(em);
+                        em = existing;
+                        err = 0;
+                }
+        }
+        spin_unlock(&em_tree->lock);
+out:
+        if (path)
+                btrfs_free_path(path);
+        if (trans) {
+                ret = btrfs_end_transaction(trans, root);
+                if (!err)
+                        err = ret;
+        }
+        if (err) {
+                free_extent_map(em);
+                WARN_ON(1);
+                return ERR_PTR(err);
+        }
+        return em;
+}
+/*
+ * Direct I/O hook.  No direct I/O path is provided by this function:
+ * every request is refused with -EINVAL, whatever its arguments.
+ */
+static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
+                               const struct iovec *iov, loff_t offset,
+                               unsigned long nr_segs)
+{
+        return -EINVAL;
+}
+/*
+ * bmap hook: hand the block lookup to extent_bmap(), passing
+ * btrfs_get_extent as the extent lookup function, and return its result.
+ */
+static sector_t btrfs_bmap(struct address_space *mapping, sector_t iblock)
+{
+        sector_t sector = extent_bmap(mapping, iblock, btrfs_get_extent);
+        return sector;
+}
+/*
+ * Read one page through the io_tree of the inode that owns the page.
+ * 'file' is not used; the work is delegated to extent_read_full_page()
+ * and its return value is passed back unchanged.
+ */
+int btrfs_readpage(struct file *file, struct page *page)
+{
+        struct inode *host = page->mapping->host;
+        return extent_read_full_page(&BTRFS_I(host)->io_tree, page,
+                                     btrfs_get_extent);
+}
+/*
+ * Write one page through the io_tree of the inode that owns the page.
+ * When the current task has PF_MEMALLOC set, nothing is written: the
+ * page is redirtied, unlocked and 0 is returned.
+ */
+static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
+{
+        struct inode *host;
+        if (current->flags & PF_MEMALLOC) {
+                redirty_page_for_writepage(wbc, page);
+                unlock_page(page);
+                return 0;
+        }
+        host = page->mapping->host;
+        return extent_write_full_page(&BTRFS_I(host)->io_tree, page,
+                                      btrfs_get_extent, wbc);
+}
+/*
+ * Write back the pages of 'mapping' by delegating to extent_writepages()
+ * with the owning inode's io_tree; its return value is passed back.
+ */
+int btrfs_writepages(struct address_space *mapping,
+                     struct writeback_control *wbc)
+{
+        struct btrfs_inode *bi = BTRFS_I(mapping->host);
+        return extent_writepages(&bi->io_tree, mapping, btrfs_get_extent, wbc);
+}
+/*
+ * Read the 'nr_pages' pages on the 'pages' list by delegating to
+ * extent_readpages() with the owning inode's io_tree.  'file' is not used.
+ */
+static int
+btrfs_readpages(struct file *file, struct address_space *mapping,
+                struct list_head *pages, unsigned nr_pages)
+{
+        struct btrfs_inode *bi = BTRFS_I(mapping->host);
+        return extent_readpages(&bi->io_tree, mapping, pages, nr_pages,
+                                btrfs_get_extent);
+}
+/*
+ * Ask try_release_extent_mapping() to release the extent state of 'page'.
+ * Only when it answers 1 is the page's private flag and value cleared and
+ * one page reference dropped.  The helper's return value is passed back
+ * unchanged in every case.
+ */
+static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
+{
+        struct btrfs_inode *bi = BTRFS_I(page->mapping->host);
+        int released;
+        released = try_release_extent_mapping(&bi->extent_tree, &bi->io_tree,
+                                              page, gfp_flags);
+        if (released != 1)
+                return released;
+        ClearPagePrivate(page);
+        set_page_private(page, 0);
+        page_cache_release(page);
+        return released;
+}
+/*
+ * releasepage hook: pages that are under writeback or dirty are never
+ * released (returns 0); everything else goes to __btrfs_releasepage().
+ */
+static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
+{
+        int busy = PageWriteback(page) || PageDirty(page);
+        return busy ? 0 : __btrfs_releasepage(page, gfp_flags);
+}
+/*
+ * invalidatepage hook.
+ *
+ * Writeback on the page is always waited for first.  With a non-zero
+ * 'offset' the function only attempts btrfs_releasepage() and returns.
+ * With offset 0 the extent range covering the whole page is locked, an
+ * ordered extent found at the page start is finished, the extent state
+ * bits for the range are cleared, and finally the page is released and,
+ * if it still has private data, that is cleared and a page reference is
+ * dropped.
+ */
+static void btrfs_invalidatepage(struct page *page, unsigned long offset)
+{
+        struct extent_io_tree *tree;
+        struct btrfs_ordered_extent *ordered;
+        u64 page_start = page_offset(page);
+        u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
+        wait_on_page_writeback(page);
+        tree = &BTRFS_I(page->mapping->host)->io_tree;
+        /* non-zero offset: only try to release the page, then stop */
+        if (offset) {
+                btrfs_releasepage(page, GFP_NOFS);
+                return;
+        }
+        lock_extent(tree, page_start, page_end, GFP_NOFS);
+        ordered = btrfs_lookup_ordered_extent(page->mapping->host,
+                                           page_offset(page));
+        if (ordered) {
+                /*
+                 * IO on this page will never be started, so we need
+                 * to account for any ordered extents now
+                 */
+                clear_extent_bit(tree, page_start, page_end,
+                                 EXTENT_DIRTY | EXTENT_DELALLOC |
+                                 EXTENT_LOCKED, 1, 0, GFP_NOFS);
+                btrfs_finish_ordered_io(page->mapping->host,
+                                        page_start, page_end);
+                btrfs_put_ordered_extent(ordered);
+                /*
+                 * EXTENT_LOCKED was among the bits cleared above, so the
+                 * range is locked again before the final clear below
+                 */
+                lock_extent(tree, page_start, page_end, GFP_NOFS);
+        }
+        /* this clear includes EXTENT_LOCKED, i.e. it also drops our lock */
+        clear_extent_bit(tree, page_start, page_end,
+                 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
+                 EXTENT_ORDERED,
+                 1, 1, GFP_NOFS);
+        __btrfs_releasepage(page, GFP_NOFS);
+        ClearPageChecked(page);
+        /* __btrfs_releasepage() may have left the private data in place */
+        if (PagePrivate(page)) {
+                ClearPagePrivate(page);
+                set_page_private(page, 0);
+                page_cache_release(page);
+        }
+}
+/*
+ * btrfs_page_mkwrite() is not allowed to change the file size as it gets
+ * called from a page fault handler when a page is first dirtied. Hence we must
+ * be careful to check for EOF conditions here. We set the page up correctly
+ * for a written page which means we get ENOSPC checking when writing into
+ * holes and correct delalloc and unwritten extent mapping on filesystems that
+ * support these features.
+ *
+ * We are not allowed to take the i_mutex here so we have to play games to
+ * protect against truncate races as the page could now be beyond EOF.  Because
+ * vmtruncate() writes the inode size before removing pages, once we have the
+ * page lock we can determine safely if the page is beyond EOF. If it is not
+ * beyond EOF, then the page is guaranteed safe against truncation until we
+ * unlock the page.
+ *
+ * Returns 0 once the page range has been marked delalloc and the page
+ * dirtied, the error from btrfs_check_free_space() if that check fails,
+ * or -EINVAL if the page no longer belongs to this inode's mapping or
+ * starts at or beyond i_size.
+ */
+int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+{
+        struct inode *inode = fdentry(vma->vm_file)->d_inode;
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+        struct btrfs_ordered_extent *ordered;
+        char *kaddr;
+        unsigned long zero_start;
+        loff_t size;
+        int ret;
+        u64 page_start;
+        u64 page_end;
+        ret = btrfs_check_free_space(root, PAGE_CACHE_SIZE, 0);
+        if (ret)
+                goto out;
+        /* stays -EINVAL until the page has actually been set up below */
+        ret = -EINVAL;
+again:
+        lock_page(page);
+        size = i_size_read(inode);
+        page_start = page_offset(page);
+        page_end = page_start + PAGE_CACHE_SIZE - 1;
+        if ((page->mapping != inode->i_mapping) ||
+            (page_start >= size)) {
+                /* page got truncated out from underneath us */
+                goto out_unlock;
+        }
+        wait_on_page_writeback(page);
+        lock_extent(io_tree, page_start, page_end, GFP_NOFS);
+        set_page_extent_mapped(page);
+        /*
+         * we can't set the delalloc bits if there are pending ordered
+         * extents.  Drop our locks and wait for them to finish
+         */
+        ordered = btrfs_lookup_ordered_extent(inode, page_start);
+        if (ordered) {
+                unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+                unlock_page(page);
+                btrfs_start_ordered_extent(inode, ordered, 1);
+                btrfs_put_ordered_extent(ordered);
+                /* retry from the top: the page and i_size are re-checked */
+                goto again;
+        }
+        btrfs_set_extent_delalloc(inode, page_start, page_end);
+        ret = 0;
+        /* page is wholly or partially inside EOF */
+        if (page_start + PAGE_CACHE_SIZE > size)
+                zero_start = size & ~PAGE_CACHE_MASK;
+        else
+                zero_start = PAGE_CACHE_SIZE;
+        /* the page extends past i_size: zero it from zero_start to its end */
+        if (zero_start != PAGE_CACHE_SIZE) {
+                kaddr = kmap(page);
+                memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
+                flush_dcache_page(page);
+                kunmap(page);
+        }
+        ClearPageChecked(page);
+        set_page_dirty(page);
+        unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+out_unlock:
+        unlock_page(page);
+out:
+        return ret;
+}
+/*
+ * truncate hook: bring the on-disk items of a regular file in line with
+ * the current inode->i_size.  Non-regular files and append-only or
+ * immutable inodes are left untouched.
+ *
+ * Inside one transaction the inode is added to the orphan list, its
+ * BTRFS_EXTENT_DATA_KEY items are passed to btrfs_truncate_inode_items()
+ * with inode->i_size as the new size, the inode item is updated, and the
+ * orphan entry is removed again.
+ */
+static void btrfs_truncate(struct inode *inode)
+{
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        int ret;
+        struct btrfs_trans_handle *trans;
+        unsigned long nr;
+        u64 mask = root->sectorsize - 1;
+        if (!S_ISREG(inode->i_mode))
+                return;
+        if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+                return;
+        btrfs_truncate_page(inode->i_mapping, inode->i_size);
+        /* wait for ordered IO from the sector holding i_size onwards */
+        btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
+        trans = btrfs_start_transaction(root, 1);
+        btrfs_set_trans_block_group(trans, inode);
+        btrfs_i_size_write(inode, inode->i_size);
+        ret = btrfs_orphan_add(trans, inode);
+        if (ret)
+                goto out;
+        /* FIXME, add redo link to tree so we don't leak on crash */
+        /*
+         * NOTE(review): the result of btrfs_truncate_inode_items() is
+         * overwritten by btrfs_orphan_del() below without being looked at,
+         * and the result of btrfs_update_inode() is discarded.
+         */
+        ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size,
+                                      BTRFS_EXTENT_DATA_KEY);
+        btrfs_update_inode(trans, root, inode);
+        ret = btrfs_orphan_del(trans, inode);
+        BUG_ON(ret);
+out:
+        /* sample blocks_used before the handle is given back */
+        nr = trans->blocks_used;
+        ret = btrfs_end_transaction_throttle(trans, root);
+        BUG_ON(ret);
+        btrfs_btree_balance_dirty(root, nr);
+}
+/*
+ * Helper for the subvolume ioctl: create the top-level directory inode
+ * of a new subvolume and attach it to 'dentry'.
+ *
+ * The inode is created with objectid 'new_dirid' and mode S_IFDIR | 0700,
+ * gets the directory inode/file operations, a link count of 1 and size 0.
+ * Returns 0 on success, or the error from btrfs_new_inode() or
+ * btrfs_update_inode(); the dentry is only instantiated on success.
+ */
+int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
+                             struct btrfs_root *new_root, struct dentry *dentry,
+                             u64 new_dirid, u64 alloc_hint)
+{
+        struct inode *root_dir;
+        u64 dir_index = 0;
+        int ret;
+        root_dir = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid,
+                                   new_dirid, alloc_hint, S_IFDIR | 0700,
+                                   &dir_index);
+        if (IS_ERR(root_dir))
+                return PTR_ERR(root_dir);
+        root_dir->i_op = &btrfs_dir_inode_operations;
+        root_dir->i_fop = &btrfs_dir_file_operations;
+        root_dir->i_nlink = 1;
+        btrfs_i_size_write(root_dir, 0);
+        ret = btrfs_update_inode(trans, new_root, root_dir);
+        if (!ret)
+                d_instantiate(dentry, root_dir);
+        return ret;
+}
+/*
+ * Helper for file defrag and space balancing: request synchronous
+ * readahead for the page indexes 'offset' through 'last_index'
+ * (inclusive) of 'mapping'.
+ *
+ * Returns the page index just past the requested range.
+ */
+unsigned long btrfs_force_ra(struct address_space *mapping,
+                              struct file_ra_state *ra, struct file *file,
+                              pgoff_t offset, pgoff_t last_index)
+{
+        pgoff_t nr_to_read = last_index - offset + 1;
+        page_cache_sync_readahead(mapping, ra, file, offset, nr_to_read);
+        return offset + nr_to_read;
+}
+/*
+ * Allocate a btrfs_inode from btrfs_inode_cachep and set up the btrfs
+ * private fields initialised here (transaction ids, ACL cache markers,
+ * orphan list head, ordered extent tree).
+ *
+ * Returns the embedded VFS inode, or NULL if the allocation failed.
+ */
+struct inode *btrfs_alloc_inode(struct super_block *sb)
+{
+        struct btrfs_inode *bi;
+        bi = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
+        if (bi == NULL)
+                return NULL;
+        bi->last_trans = 0;
+        bi->logged_trans = 0;
+        bi->i_acl = BTRFS_ACL_NOT_CACHED;
+        bi->i_default_acl = BTRFS_ACL_NOT_CACHED;
+        INIT_LIST_HEAD(&bi->i_orphan);
+        btrfs_ordered_inode_tree_init(&bi->ordered_tree);
+        return &bi->vfs_inode;
+}
+/*
+ * Tear down a btrfs inode and return it to btrfs_inode_cachep.
+ *
+ * Warns if the inode still has dentries or cached pages, releases any
+ * cached ACLs, complains if the inode is still on the orphan list, removes
+ * every ordered extent still attached to it and drops its extent cache.
+ */
+void btrfs_destroy_inode(struct inode *inode)
+{
+        struct btrfs_inode *bi = BTRFS_I(inode);
+        struct btrfs_ordered_extent *ordered;
+        WARN_ON(!list_empty(&inode->i_dentry));
+        WARN_ON(inode->i_data.nrpages);
+        /* a cached ACL is neither NULL nor the "not cached" marker */
+        if (bi->i_acl && bi->i_acl != BTRFS_ACL_NOT_CACHED)
+                posix_acl_release(bi->i_acl);
+        if (bi->i_default_acl && bi->i_default_acl != BTRFS_ACL_NOT_CACHED)
+                posix_acl_release(bi->i_default_acl);
+        spin_lock(&bi->root->list_lock);
+        if (!list_empty(&bi->i_orphan)) {
+                printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan"
+                       " list\n", inode->i_ino);
+                dump_stack();
+        }
+        spin_unlock(&bi->root->list_lock);
+        /* drain every ordered extent that is still hanging off the inode */
+        for (;;) {
+                ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
+                if (!ordered)
+                        break;
+                printk(KERN_ERR "btrfs found ordered "
+                       "extent %llu %llu on inode cleanup\n",
+                       (unsigned long long)ordered->file_offset,
+                       (unsigned long long)ordered->len);
+                btrfs_remove_ordered_extent(inode, ordered);
+                /*
+                 * two puts; presumably one for the lookup reference and
+                 * one for the reference the tree held -- TODO confirm
+                 */
+                btrfs_put_ordered_extent(ordered);
+                btrfs_put_ordered_extent(ordered);
+        }
+        btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
+        kmem_cache_free(btrfs_inode_cachep, bi);
+}
+/*
+ * Slab constructor for btrfs_inode objects: runs inode_init_once() on
+ * the embedded VFS inode.
+ */
+static void init_once(void *foo)
+{
+        struct btrfs_inode *bi = foo;
+        inode_init_once(&bi->vfs_inode);
+}
+/*
+ * Destroy every btrfs slab cache that has been created.  Caches whose
+ * pointer is still NULL are skipped, so this is usable as the cleanup
+ * path of a partially completed btrfs_init_cachep().
+ */
+void btrfs_destroy_cachep(void)
+{
+        struct kmem_cache *caches[] = {
+                btrfs_inode_cachep,
+                btrfs_trans_handle_cachep,
+                btrfs_transaction_cachep,
+                btrfs_bit_radix_cachep,
+                btrfs_path_cachep,
+        };
+        unsigned int i;
+        for (i = 0; i < sizeof(caches) / sizeof(caches[0]); i++) {
+                if (caches[i])
+                        kmem_cache_destroy(caches[i]);
+        }
+}
+/*
+ * Thin wrapper around kmem_cache_create(): alignment is 0 and the flags
+ * are SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD plus the caller's
+ * 'extra_flags'.  Returns whatever kmem_cache_create() returns.
+ */
+struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
+                                       unsigned long extra_flags,
+                                       void (*ctor)(void *))
+{
+        unsigned long flags = SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD |
+                              extra_flags;
+        return kmem_cache_create(name, size, 0, flags, ctor);
+}
+/*
+ * Create the five slab caches set up here (inode, transaction handle,
+ * transaction, path and bit radix).
+ *
+ * Returns 0 when all of them exist.  On the first failure the caches
+ * created so far are destroyed via btrfs_destroy_cachep() and -ENOMEM
+ * is returned.
+ */
+int btrfs_init_cachep(void)
+{
+        /* only the inode cache has a constructor */
+        btrfs_inode_cachep =
+                btrfs_cache_create("btrfs_inode_cache",
+                                   sizeof(struct btrfs_inode), 0, init_once);
+        if (!btrfs_inode_cachep)
+                goto out_destroy;
+        btrfs_trans_handle_cachep =
+                btrfs_cache_create("btrfs_trans_handle_cache",
+                                   sizeof(struct btrfs_trans_handle), 0, NULL);
+        if (!btrfs_trans_handle_cachep)
+                goto out_destroy;
+        btrfs_transaction_cachep =
+                btrfs_cache_create("btrfs_transaction_cache",
+                                   sizeof(struct btrfs_transaction), 0, NULL);
+        if (!btrfs_transaction_cachep)
+                goto out_destroy;
+        btrfs_path_cachep =
+                btrfs_cache_create("btrfs_path_cache",
+                                   sizeof(struct btrfs_path), 0, NULL);
+        if (!btrfs_path_cachep)
+                goto out_destroy;
+        btrfs_bit_radix_cachep =
+                btrfs_cache_create("btrfs_radix", 256,
+                                   SLAB_DESTROY_BY_RCU, NULL);
+        if (!btrfs_bit_radix_cachep)
+                goto out_destroy;
+        return 0;
+out_destroy:
+        btrfs_destroy_cachep();
+        return -ENOMEM;
+}
+/*
+ * getattr hook: start from generic_fillattr() and then override
+ *  - dev with the s_dev of the root's anon_super,
+ *  - blksize with PAGE_CACHE_SIZE,
+ *  - blocks with (inode bytes + delalloc bytes) in 512-byte units.
+ * Always returns 0.
+ */
+static int btrfs_getattr(struct vfsmount *mnt,
+                         struct dentry *dentry, struct kstat *stat)
+{
+        struct inode *inode = dentry->d_inode;
+        struct btrfs_inode *bi = BTRFS_I(inode);
+        generic_fillattr(inode, stat);
+        stat->dev = bi->root->anon_super.s_dev;
+        stat->blksize = PAGE_CACHE_SIZE;
+        stat->blocks = (inode_get_bytes(inode) + bi->delalloc_bytes) >> 9;
+        return 0;
+}
+/*
+ * rename hook: move old_dentry (in old_dir) to new_dentry (in new_dir)
+ * inside one transaction, unlinking whatever new_dentry named before.
+ *
+ * Returns -EXDEV when the source inode and new_dir belong to different
+ * roots or when the source inode number is BTRFS_FIRST_FREE_OBJECTID,
+ * -ENOTEMPTY when a directory would replace a target whose i_size is
+ * above BTRFS_EMPTY_DIR_SIZE, otherwise 0 or the first error returned
+ * by a helper.
+ */
+static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+                           struct inode *new_dir, struct dentry *new_dentry)
+{
+        struct btrfs_trans_handle *trans;
+        struct btrfs_root *root = BTRFS_I(old_dir)->root;
+        struct inode *victim = new_dentry->d_inode;
+        struct inode *moving = old_dentry->d_inode;
+        struct timespec now = CURRENT_TIME;
+        u64 dir_index = 0;
+        int ret;
+        /* renames between subvolumes are not allowed */
+        if (BTRFS_I(moving)->root->root_key.objectid !=
+            BTRFS_I(new_dir)->root->root_key.objectid)
+                return -EXDEV;
+        if (S_ISDIR(moving->i_mode) && victim &&
+            victim->i_size > BTRFS_EMPTY_DIR_SIZE)
+                return -ENOTEMPTY;
+        /*
+         * renaming a snapshot or subvolume would require juggling the
+         * backrefs, which is not implemented yet
+         */
+        if (moving->i_ino == BTRFS_FIRST_FREE_OBJECTID)
+                return -EXDEV;
+        ret = btrfs_check_free_space(root, 1, 0);
+        if (ret)
+                return ret;
+        trans = btrfs_start_transaction(root, 1);
+        btrfs_set_trans_block_group(trans, new_dir);
+        /* link count is raised before the old name is unlinked */
+        btrfs_inc_nlink(moving);
+        old_dir->i_ctime = old_dir->i_mtime = now;
+        new_dir->i_ctime = new_dir->i_mtime = now;
+        moving->i_ctime = now;
+        ret = btrfs_unlink_inode(trans, root, old_dir, moving,
+                                 old_dentry->d_name.name,
+                                 old_dentry->d_name.len);
+        if (ret)
+                goto out_end_trans;
+        if (victim) {
+                victim->i_ctime = CURRENT_TIME;
+                ret = btrfs_unlink_inode(trans, root, new_dir, victim,
+                                         new_dentry->d_name.name,
+                                         new_dentry->d_name.len);
+                if (ret)
+                        goto out_end_trans;
+                /* last link gone: record the inode as an orphan */
+                if (victim->i_nlink == 0) {
+                        ret = btrfs_orphan_add(trans, victim);
+                        if (ret)
+                                goto out_end_trans;
+                }
+        }
+        ret = btrfs_set_inode_index(new_dir, &dir_index);
+        if (ret)
+                goto out_end_trans;
+        ret = btrfs_add_link(trans, new_dentry->d_parent->d_inode,
+                             moving, new_dentry->d_name.name,
+                             new_dentry->d_name.len, 1, dir_index);
+out_end_trans:
+        btrfs_end_transaction_throttle(trans, root);
+        return ret;
+}
+/*
+ * some fairly slow code that needs optimization. This walks the list
+ * of all the inodes with pending delalloc and forces them to disk.
+ *
+ * Returns -EROFS if the superblock has MS_RDONLY set, otherwise 0 after
+ * the async submit counters have both dropped to zero.
+ */
+int btrfs_start_delalloc_inodes(struct btrfs_root *root)
+{
+        struct list_head *head = &root->fs_info->delalloc_inodes;
+        struct btrfs_inode *binode;
+        struct inode *inode;
+        if (root->fs_info->sb->s_flags & MS_RDONLY)
+                return -EROFS;
+        spin_lock(&root->fs_info->delalloc_lock);
+        while (!list_empty(head)) {
+                binode = list_entry(head->next, struct btrfs_inode,
+                                    delalloc_inodes);
+                inode = igrab(&binode->vfs_inode);
+                /* no reference could be taken: unhook the entry instead */
+                if (!inode)
+                        list_del_init(&binode->delalloc_inodes);
+                /* delalloc_lock is not held across the flush and iput */
+                spin_unlock(&root->fs_info->delalloc_lock);
+                if (inode) {
+                        filemap_flush(inode->i_mapping);
+                        iput(inode);
+                }
+                cond_resched();
+                spin_lock(&root->fs_info->delalloc_lock);
+        }
+        spin_unlock(&root->fs_info->delalloc_lock);
+        /* the filemap_flush will queue IO into the worker threads, but
+         * we have to make sure the IO is actually started and that
+         * ordered extents get created before we return
+         */
+        atomic_inc(&root->fs_info->async_submit_draining);
+        /* counters are re-checked after every wakeup until both are zero */
+        while (atomic_read(&root->fs_info->nr_async_submits) ||
+              atomic_read(&root->fs_info->async_delalloc_pages)) {
+                wait_event(root->fs_info->async_submit_wait,
+                   (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
+                    atomic_read(&root->fs_info->async_delalloc_pages) == 0));
+        }
+        atomic_dec(&root->fs_info->async_submit_draining);
+        return 0;
+}
+/*
+ * symlink hook: create the symbolic link 'dentry' in directory 'dir'
+ * with target 'symname'.
+ *
+ * The target string, including its trailing NUL, is written as one inline
+ * file extent item at offset 0 of the new inode, so its length must not
+ * exceed BTRFS_MAX_INLINE_DATA_SIZE(root); longer targets are rejected
+ * with -ENAMETOOLONG.  i_size is set to the target length without the NUL.
+ *
+ * Returns 0 on success or a negative errno.  When a step fails after the
+ * inode has been created, the inode's link count is decremented and the
+ * inode is released with iput().
+ */
+static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
+                         const char *symname)
+{
+        struct btrfs_trans_handle *trans;
+        struct btrfs_root *root = BTRFS_I(dir)->root;
+        struct btrfs_path *path;
+        struct btrfs_key key;
+        struct inode *inode = NULL;
+        int err;
+        int drop_inode = 0;
+        u64 objectid;
+        u64 index = 0;
+        int name_len;
+        int datasize;
+        unsigned long ptr;
+        struct btrfs_file_extent_item *ei;
+        struct extent_buffer *leaf;
+        unsigned long nr = 0;
+        /* the stored length includes the terminating NUL */
+        name_len = strlen(symname) + 1;
+        if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
+                return -ENAMETOOLONG;
+        err = btrfs_check_free_space(root, 1, 0);
+        if (err)
+                goto out_fail;
+        trans = btrfs_start_transaction(root, 1);
+        btrfs_set_trans_block_group(trans, dir);
+        err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
+        if (err) {
+                err = -ENOSPC;
+                goto out_unlock;
+        }
+        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
+                                dentry->d_name.len,
+                                dentry->d_parent->d_inode->i_ino, objectid,
+                                BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO,
+                                &index);
+        err = PTR_ERR(inode);
+        if (IS_ERR(inode))
+                goto out_unlock;
+        err = btrfs_init_acl(inode, dir);
+        if (err) {
+                drop_inode = 1;
+                goto out_unlock;
+        }
+        btrfs_set_trans_block_group(trans, inode);
+        err = btrfs_add_nondir(trans, dentry, inode, 0, index);
+        if (err)
+                drop_inode = 1;
+        else {
+                inode->i_mapping->a_ops = &btrfs_aops;
+                inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
+                inode->i_fop = &btrfs_file_operations;
+                inode->i_op = &btrfs_file_inode_operations;
+                BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+        }
+        dir->i_sb->s_dirt = 1;
+        btrfs_update_inode_block_group(trans, inode);
+        btrfs_update_inode_block_group(trans, dir);
+        if (drop_inode)
+                goto out_unlock;
+        path = btrfs_alloc_path();
+        if (!path) {
+                /* fail the symlink instead of bringing the machine down */
+                err = -ENOMEM;
+                drop_inode = 1;
+                goto out_unlock;
+        }
+        key.objectid = inode->i_ino;
+        key.offset = 0;
+        btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
+        datasize = btrfs_file_extent_calc_inline_size(name_len);
+        err = btrfs_insert_empty_item(trans, root, path, &key,
+                                      datasize);
+        if (err) {
+                /* nothing past this point frees the path on this exit */
+                btrfs_free_path(path);
+                drop_inode = 1;
+                goto out_unlock;
+        }
+        /* fill in the inline extent header, then copy the target string */
+        leaf = path->nodes[0];
+        ei = btrfs_item_ptr(leaf, path->slots[0],
+                            struct btrfs_file_extent_item);
+        btrfs_set_file_extent_generation(leaf, ei, trans->transid);
+        btrfs_set_file_extent_type(leaf, ei,
+                                   BTRFS_FILE_EXTENT_INLINE);
+        btrfs_set_file_extent_encryption(leaf, ei, 0);
+        btrfs_set_file_extent_compression(leaf, ei, 0);
+        btrfs_set_file_extent_other_encoding(leaf, ei, 0);
+        btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
+        ptr = btrfs_file_extent_inline_start(ei);
+        write_extent_buffer(leaf, symname, ptr, name_len);
+        btrfs_mark_buffer_dirty(leaf);
+        btrfs_free_path(path);
+        /* replace the regular-file ops installed above with symlink ops */
+        inode->i_op = &btrfs_symlink_inode_operations;
+        inode->i_mapping->a_ops = &btrfs_symlink_aops;
+        inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
+        inode_set_bytes(inode, name_len);
+        btrfs_i_size_write(inode, name_len - 1);
+        err = btrfs_update_inode(trans, root, inode);
+        if (err)
+                drop_inode = 1;
+out_unlock:
+        nr = trans->blocks_used;
+        btrfs_end_transaction_throttle(trans, root);
+out_fail:
+        if (drop_inode) {
+                inode_dec_link_count(inode);
+                iput(inode);
+        }
+        btrfs_btree_balance_dirty(root, nr);
+        return err;
+}
+/*
+ * Preallocate extents for the file range [start, end) of 'inode'.
+ *
+ * Space is reserved in chunks of at most fs_info->max_extent bytes and
+ * each chunk is inserted as a BTRFS_FILE_EXTENT_PREALLOC file extent.
+ * 'alloc_hint' is passed to the allocator for the first chunk and then
+ * moved to the end of each extent just allocated.  Unless 'mode' has
+ * FALLOC_FL_KEEP_SIZE set, i_size is raised to cover what was allocated.
+ *
+ * Returns 0 when the whole range was allocated, or the error from
+ * btrfs_reserve_extent().  Extents allocated before a failure stay in
+ * place and the inode is still updated for them, but the error is
+ * reported to the caller rather than being masked by that update.
+ */
+static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
+                               u64 alloc_hint, int mode)
+{
+        struct btrfs_trans_handle *trans;
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        struct btrfs_key ins;
+        u64 alloc_size;
+        u64 cur_offset = start;
+        u64 num_bytes = end - start;
+        int ret = 0;
+        int err;
+        trans = btrfs_join_transaction(root, 1);
+        BUG_ON(!trans);
+        btrfs_set_trans_block_group(trans, inode);
+        while (num_bytes > 0) {
+                alloc_size = min(num_bytes, root->fs_info->max_extent);
+                ret = btrfs_reserve_extent(trans, root, alloc_size,
+                                           root->sectorsize, 0, alloc_hint,
+                                           (u64)-1, &ins, 1);
+                if (ret) {
+                        WARN_ON(1);
+                        goto out;
+                }
+                /* ins.objectid/ins.offset describe the reserved extent */
+                ret = insert_reserved_file_extent(trans, inode,
+                                                  cur_offset, ins.objectid,
+                                                  ins.offset, ins.offset,
+                                                  ins.offset, 0, 0, 0,
+                                                  BTRFS_FILE_EXTENT_PREALLOC);
+                BUG_ON(ret);
+                num_bytes -= ins.offset;
+                cur_offset += ins.offset;
+                alloc_hint = ins.objectid + ins.offset;
+        }
+out:
+        if (cur_offset > start) {
+                /* at least one extent was added: record it in the inode */
+                inode->i_ctime = CURRENT_TIME;
+                btrfs_set_flag(inode, PREALLOC);
+                if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+                    cur_offset > i_size_read(inode))
+                        btrfs_i_size_write(inode, cur_offset);
+                /*
+                 * kept in its own variable so that a reservation failure
+                 * stored in ret is not overwritten with 0 here
+                 */
+                err = btrfs_update_inode(trans, root, inode);
+                BUG_ON(err);
+        }
+        btrfs_end_transaction(trans, root);
+        return ret;
+}
+/*
+ * fallocate hook: preallocate space for the byte range
+ * [offset, offset + len) of 'inode'.
+ *
+ * The range is widened to [alloc_start, alloc_end) using a
+ * sectorsize - 1 mask.  Under i_mutex the file is first expanded with
+ * btrfs_cont_expand() if alloc_start lies beyond i_size, the extent range
+ * is locked once no ordered extent overlaps it, and then every extent map
+ * in the range that is a hole is handed to prealloc_file_range().
+ * 'mode' is passed through to prealloc_file_range() unchanged.
+ *
+ * Returns 0 on success, or the error from btrfs_cont_expand() or
+ * prealloc_file_range().
+ */
+static long btrfs_fallocate(struct inode *inode, int mode,
+                            loff_t offset, loff_t len)
+{
+        u64 cur_offset;
+        u64 last_byte;
+        u64 alloc_start;
+        u64 alloc_end;
+        u64 alloc_hint = 0;
+        u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
+        struct extent_map *em;
+        int ret;
+        alloc_start = offset & ~mask;
+        alloc_end =  (offset + len + mask) & ~mask;
+        mutex_lock(&inode->i_mutex);
+        if (alloc_start > inode->i_size) {
+                ret = btrfs_cont_expand(inode, alloc_start);
+                if (ret)
+                        goto out;
+        }
+        /*
+         * lock the range, and if an ordered extent overlaps it, unlock,
+         * wait for the range and try again; the loop exits with the
+         * extent range still locked
+         */
+        while (1) {
+                struct btrfs_ordered_extent *ordered;
+                lock_extent(&BTRFS_I(inode)->io_tree, alloc_start,
+                            alloc_end - 1, GFP_NOFS);
+                ordered = btrfs_lookup_first_ordered_extent(inode,
+                                                            alloc_end - 1);
+                if (ordered &&
+                    ordered->file_offset + ordered->len > alloc_start &&
+                    ordered->file_offset < alloc_end) {
+                        btrfs_put_ordered_extent(ordered);
+                        unlock_extent(&BTRFS_I(inode)->io_tree,
+                                      alloc_start, alloc_end - 1, GFP_NOFS);
+                        btrfs_wait_ordered_range(inode, alloc_start,
+                                                 alloc_end - alloc_start);
+                } else {
+                        if (ordered)
+                                btrfs_put_ordered_extent(ordered);
+                        break;
+                }
+        }
+        cur_offset = alloc_start;
+        /* walk the extent maps covering the range, filling in the holes */
+        while (1) {
+                em = btrfs_get_extent(inode, NULL, 0, cur_offset,
+                                      alloc_end - cur_offset, 0);
+                BUG_ON(IS_ERR(em) || !em);
+                /* end of this step: end of the map, capped at alloc_end */
+                last_byte = min(extent_map_end(em), alloc_end);
+                last_byte = (last_byte + mask) & ~mask;
+                if (em->block_start == EXTENT_MAP_HOLE) {
+                        ret = prealloc_file_range(inode, cur_offset,
+                                        last_byte, alloc_hint, mode);
+                        if (ret < 0) {
+                                free_extent_map(em);
+                                break;
+                        }
+                }
+                /* remember this map's block_start as the next alloc hint */
+                if (em->block_start <= EXTENT_MAP_LAST_BYTE)
+                        alloc_hint = em->block_start;
+                free_extent_map(em);
+                cur_offset = last_byte;
+                if (cur_offset >= alloc_end) {
+                        ret = 0;
+                        break;
+                }
+        }
+        unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, alloc_end - 1,
+                      GFP_NOFS);
+out:
+        mutex_unlock(&inode->i_mutex);
+        return ret;
+}
+/*
+ * set_page_dirty hook used by btrfs_aops: the page is handed straight to
+ * __set_page_dirty_nobuffers() and that call's result is returned.
+ */
+static int btrfs_set_page_dirty(struct page *page)
+{
+        int dirtied = __set_page_dirty_nobuffers(page);
+        return dirtied;
+}
+/*
+ * Permission hook shared by the btrfs inode_operations tables.
+ *
+ * A request that includes MAY_WRITE on an inode carrying the READONLY
+ * flag is refused with -EACCES.  Everything else is decided by
+ * generic_permission(), which is given btrfs_check_acl as its third
+ * argument.
+ */
+static int btrfs_permission(struct inode *inode, int mask)
+{
+        int wants_write = mask & MAY_WRITE;
+        if (wants_write && btrfs_test_flag(inode, READONLY))
+                return -EACCES;
+        return generic_permission(inode, mask, btrfs_check_acl);
+}
+/*
+ * inode_operations table for directories (going by its name).  The
+ * entries are grouped by purpose; with designated initializers the order
+ * has no effect on the resulting structure.
+ */
+static struct inode_operations btrfs_dir_inode_operations = {
+        /* name lookup, entry creation and removal */
+        .lookup         = btrfs_lookup,
+        .create         = btrfs_create,
+        .mknod          = btrfs_mknod,
+        .mkdir          = btrfs_mkdir,
+        .symlink        = btrfs_symlink,
+        .link           = btrfs_link,
+        .unlink         = btrfs_unlink,
+        .rmdir          = btrfs_rmdir,
+        .rename         = btrfs_rename,
+        /* attributes and access control */
+        .getattr        = btrfs_getattr,
+        .setattr        = btrfs_setattr,
+        .permission     = btrfs_permission,
+        /* extended attributes */
+        .setxattr       = btrfs_setxattr,
+        .getxattr       = btrfs_getxattr,
+        .listxattr      = btrfs_listxattr,
+        .removexattr    = btrfs_removexattr,
+};
+/*
+ * Reduced directory table offering only lookup and the permission check;
+ * no hook that modifies the directory is wired up.
+ * NOTE(review): "ro" presumably means read-only directories - confirm
+ * where this table gets installed.
+ */
+static struct inode_operations btrfs_dir_ro_inode_operations = {
+        .permission     = btrfs_permission,
+        .lookup         = btrfs_lookup,
+};
+/*
+ * file_operations table for open directories (going by its name).
+ * Seeking and read() use the generic helpers, directory listing goes
+ * through btrfs_real_readdir, and both ioctl entry points share
+ * btrfs_ioctl.
+ */
+static struct file_operations btrfs_dir_file_operations = {
+        /* generic helpers */
+        .llseek         = generic_file_llseek,
+        .read           = generic_read_dir,
+        /* btrfs specific handlers */
+        .readdir        = btrfs_real_readdir,
+        .fsync          = btrfs_sync_file,
+        .release        = btrfs_release_file,
+        .unlocked_ioctl = btrfs_ioctl,
+#ifdef CONFIG_COMPAT
+        /* the compat entry point reuses the native handler */
+        .compat_ioctl   = btrfs_ioctl,
+#endif
+};
+/*
+ * Callback table plugging the btrfs inode code into the extent_io layer.
+ * Grouped by the stage each hook is named after; the set of hooks and
+ * their targets are unchanged.
+ */
+static struct extent_io_ops btrfs_extent_io_ops = {
+        /* delayed allocation and extent state bit tracking */
+        .fill_delalloc = run_delalloc_range,
+        .set_bit_hook = btrfs_set_bit_hook,
+        .clear_bit_hook = btrfs_clear_bit_hook,
+        /* bio construction and submission */
+        .merge_bio_hook = btrfs_merge_bio_hook,
+        .submit_bio_hook = btrfs_submit_bio_hook,
+        /* read side */
+        .readpage_end_io_hook = btrfs_readpage_end_io_hook,
+        .readpage_io_failed_hook = btrfs_io_failed_hook,
+        /* write side */
+        .writepage_start_hook = btrfs_writepage_start_hook,
+        .writepage_end_io_hook = btrfs_writepage_end_io_hook,
+};
+/*
+ * Main address_space_operations table: page cache read/write paths,
+ * direct I/O, bmap and page lifetime hooks.
+ */
+static struct address_space_operations btrfs_aops = {
+        /* reading */
+        .readpage       = btrfs_readpage,
+        .readpages      = btrfs_readpages,
+        /* writing */
+        .writepage      = btrfs_writepage,
+        .writepages     = btrfs_writepages,
+        .set_page_dirty = btrfs_set_page_dirty,
+        /* page lifetime */
+        .invalidatepage = btrfs_invalidatepage,
+        .releasepage    = btrfs_releasepage,
+        /* everything else */
+        .sync_page      = block_sync_page,
+        .bmap           = btrfs_bmap,
+        .direct_IO      = btrfs_direct_IO,
+};
+/*
+ * Smaller address_space_operations table named for symlinks: only the
+ * single-page read/write hooks and the page lifetime hooks are set.
+ */
+static struct address_space_operations btrfs_symlink_aops = {
+        /* page lifetime */
+        .releasepage    = btrfs_releasepage,
+        .invalidatepage = btrfs_invalidatepage,
+        /* single page I/O */
+        .writepage      = btrfs_writepage,
+        .readpage       = btrfs_readpage,
+};
+/*
+ * inode_operations table for regular files (going by its name): size
+ * changing hooks, attribute and xattr handlers and the permission check.
+ */
+static struct inode_operations btrfs_file_inode_operations = {
+        /* size changes */
+        .truncate       = btrfs_truncate,
+        .fallocate      = btrfs_fallocate,
+        /* attributes and access control */
+        .getattr        = btrfs_getattr,
+        .setattr        = btrfs_setattr,
+        .permission     = btrfs_permission,
+        /* extended attributes */
+        .getxattr       = btrfs_getxattr,
+        .setxattr       = btrfs_setxattr,
+        .listxattr      = btrfs_listxattr,
+        .removexattr    = btrfs_removexattr,
+};
+/*
+ * inode_operations table named for special inodes: attributes, extended
+ * attributes and the permission check only.
+ * NOTE(review): presumably used for device nodes, fifos and sockets -
+ * confirm where it is installed.
+ */
+static struct inode_operations btrfs_special_inode_operations = {
+        /* attributes and access control */
+        .getattr        = btrfs_getattr,
+        .setattr        = btrfs_setattr,
+        .permission     = btrfs_permission,
+        /* extended attributes */
+        .getxattr       = btrfs_getxattr,
+        .setxattr       = btrfs_setxattr,
+        .listxattr      = btrfs_listxattr,
+        .removexattr    = btrfs_removexattr,
+};
+/*
+ * inode_operations table named for symlinks: link resolution is left to
+ * the generic/page based helpers, access checks go through
+ * btrfs_permission.
+ */
+static struct inode_operations btrfs_symlink_inode_operations = {
+        .permission     = btrfs_permission,
+        /* generic link handling */
+        .readlink       = generic_readlink,
+        .follow_link    = page_follow_link_light,
+        .put_link       = page_put_link,
+};
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
new file mode 100644
index 000000000000..c2aa33e3feb5
--- /dev/null
+++ b/fs/btrfs/ioctl.c
@@ -0,0 +1,1132 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#include <linux/kernel.h>
+#include <linux/bio.h>
+#include <linux/buffer_head.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/fsnotify.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/backing-dev.h>
+#include <linux/mount.h>
+#include <linux/mpage.h>
+#include <linux/namei.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/statfs.h>
+#include <linux/compat.h>
+#include <linux/bit_spinlock.h>
+#include <linux/security.h>
+#include <linux/version.h>
+#include <linux/xattr.h>
+#include <linux/vmalloc.h>
+#include "compat.h"
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "ioctl.h"
+#include "print-tree.h"
+#include "volumes.h"
+#include "locking.h"
+/*
+ * Create a new, empty subvolume called @name and link it into the parent
+ * directory of @dentry.
+ *
+ * @root:    root of the tree holding the parent directory
+ * @dentry:  dentry the new subvolume is attached to
+ * @name:    name of the new directory entry, @namelen bytes long
+ *
+ * Two transactions are used.  The first allocates the root leaf of the
+ * new tree, inserts its root item, the directory entry and both root
+ * refs, and is committed.  The second runs on the freshly read root and
+ * creates its root directory through btrfs_create_subvol_root().
+ *
+ * Returns 0 on success, otherwise the error reported by the failing
+ * helper.
+ */
+static noinline int create_subvol(struct btrfs_root *root,
+                                  struct dentry *dentry,
+                                  char *name, int namelen)
+{
+        struct btrfs_trans_handle *trans;
+        struct btrfs_key key;
+        struct btrfs_root_item root_item;
+        struct btrfs_inode_item *inode_item;
+        struct extent_buffer *leaf;
+        /* stays equal to root until the new root has been read back */
+        struct btrfs_root *new_root = root;
+        struct inode *dir;
+        int ret;
+        int err;
+        u64 objectid;
+        u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
+        u64 index = 0;
+        unsigned long nr = 1;
+        /* no transaction is open yet, so skip the commit on failure */
+        ret = btrfs_check_free_space(root, 1, 0);
+        if (ret)
+                goto fail_commit;
+        trans = btrfs_start_transaction(root, 1);
+        BUG_ON(!trans);
+        /* objectid of the new tree, taken from the tree root */
+        ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
+                                       0, &objectid);
+        if (ret)
+                goto fail;
+        /* this block becomes the root leaf of the new tree */
+        leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
+                                      objectid, trans->transid, 0, 0, 0);
+        if (IS_ERR(leaf)) {
+                ret = PTR_ERR(leaf);
+                goto fail;
+        }
+        /* header of an empty level 0 block owned by the new objectid */
+        btrfs_set_header_nritems(leaf, 0);
+        btrfs_set_header_level(leaf, 0);
+        btrfs_set_header_bytenr(leaf, leaf->start);
+        btrfs_set_header_generation(leaf, trans->transid);
+        btrfs_set_header_owner(leaf, objectid);
+        write_extent_buffer(leaf, root->fs_info->fsid,
+                            (unsigned long)btrfs_header_fsid(leaf),
+                            BTRFS_FSID_SIZE);
+        btrfs_mark_buffer_dirty(leaf);
+        /* inode item embedded in the root item: a directory, mode 0755 */
+        inode_item = &root_item.inode;
+        memset(inode_item, 0, sizeof(*inode_item));
+        inode_item->generation = cpu_to_le64(1);
+        inode_item->size = cpu_to_le64(3);
+        inode_item->nlink = cpu_to_le32(1);
+        inode_item->nbytes = cpu_to_le64(root->leafsize);
+        inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
+        /* root item pointing at the leaf allocated above */
+        btrfs_set_root_bytenr(&root_item, leaf->start);
+        btrfs_set_root_generation(&root_item, trans->transid);
+        btrfs_set_root_level(&root_item, 0);
+        btrfs_set_root_refs(&root_item, 1);
+        btrfs_set_root_used(&root_item, 0);
+        btrfs_set_root_last_snapshot(&root_item, 0);
+        memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
+        root_item.drop_level = 0;
+        /* the leaf is fully set up: drop the lock and our reference */
+        btrfs_tree_unlock(leaf);
+        free_extent_buffer(leaf);
+        leaf = NULL;
+        btrfs_set_root_dirid(&root_item, new_dirid);
+        /* insert the root item into the tree root */
+        key.objectid = objectid;
+        key.offset = 1;
+        btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+        ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
+                                &root_item);
+        if (ret)
+                goto fail;
+        /*
+         * insert the directory item
+         */
+        /* the same key (offset now -1) is reused to read the root below */
+        key.offset = (u64)-1;
+        dir = dentry->d_parent->d_inode;
+        ret = btrfs_set_inode_index(dir, &index);
+        BUG_ON(ret);
+        ret = btrfs_insert_dir_item(trans, root,
+                                    name, namelen, dir->i_ino, &key,
+                                    BTRFS_FT_DIR, index);
+        if (ret)
+                goto fail;
+        /* the parent directory grows by twice the name length */
+        btrfs_i_size_write(dir, dir->i_size + namelen * 2);
+        ret = btrfs_update_inode(trans, root, dir);
+        BUG_ON(ret);
+        /* add the backref first */
+        ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
+                                 objectid, BTRFS_ROOT_BACKREF_KEY,
+                                 root->root_key.objectid,
+                                 dir->i_ino, index, name, namelen);
+        BUG_ON(ret);
+        /* now add the forward ref */
+        ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
+                                 root->root_key.objectid, BTRFS_ROOT_REF_KEY,
+                                 objectid,
+                                 dir->i_ino, index, name, namelen);
+        BUG_ON(ret);
+        /* end of the first transaction; the handle is gone afterwards */
+        ret = btrfs_commit_transaction(trans, root);
+        if (ret)
+                goto fail_commit;
+        /* second transaction: populate the new root's top directory */
+        new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
+        BUG_ON(!new_root);
+        trans = btrfs_start_transaction(new_root, 1);
+        BUG_ON(!trans);
+        ret = btrfs_create_subvol_root(trans, new_root, dentry, new_dirid,
+                                       BTRFS_I(dir)->block_group);
+        if (ret)
+                goto fail;
+fail:
+        /*
+         * Commit whichever transaction is open.  An error from the commit
+         * is only reported when nothing failed before it.
+         */
+        nr = trans->blocks_used;
+        err = btrfs_commit_transaction(trans, new_root);
+        if (err && !ret)
+                ret = err;
+fail_commit:
+        /* entered directly when no transaction handle is held */
+        btrfs_btree_balance_dirty(root, nr);
+        return ret;
+}
+/*
+ * Queue a snapshot of @root under the name @name and commit.
+ *
+ * @root:    root to snapshot; must have ref_cows set, else -EINVAL
+ * @dentry:  dentry recorded in the pending snapshot
+ * @name:    snapshot name, @namelen bytes; a NUL terminated copy is made
+ *
+ * A btrfs_pending_snapshot is allocated, put on the pending_snapshots
+ * list of the running transaction and the transaction is committed.
+ * NOTE(review): once queued the pending snapshot is not freed here;
+ * presumably the commit path consumes and releases it - confirm.
+ *
+ * Returns 0 on success, -EINVAL, -ENOMEM, the error from the free space
+ * check, or the error returned by the transaction commit.
+ */
+static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
+                           char *name, int namelen)
+{
+        struct btrfs_pending_snapshot *pending_snapshot;
+        struct btrfs_trans_handle *trans;
+        int ret = 0;
+        unsigned long nr = 0;
+        if (!root->ref_cows)
+                return -EINVAL;
+        ret = btrfs_check_free_space(root, 1, 0);
+        if (ret)
+                goto fail_unlock;
+        pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
+        if (!pending_snapshot) {
+                ret = -ENOMEM;
+                goto fail_unlock;
+        }
+        /* private, NUL terminated copy of the caller's name */
+        pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
+        if (!pending_snapshot->name) {
+                ret = -ENOMEM;
+                kfree(pending_snapshot);
+                goto fail_unlock;
+        }
+        memcpy(pending_snapshot->name, name, namelen);
+        pending_snapshot->name[namelen] = '\0';
+        pending_snapshot->dentry = dentry;
+        trans = btrfs_start_transaction(root, 1);
+        BUG_ON(!trans);
+        pending_snapshot->root = root;
+        list_add(&pending_snapshot->list,
+                 &trans->transaction->pending_snapshots);
+        /*
+         * Report a failed commit to the caller.  The result used to be
+         * stored in an otherwise unused variable, so a snapshot whose
+         * commit failed was reported as created.
+         */
+        ret = btrfs_commit_transaction(trans, root);
+fail_unlock:
+        btrfs_btree_balance_dirty(root, nr);
+        return ret;
+}
+/*
+ * Local copy of may_create() from fs/namei.c.
+ *
+ * Returns -EEXIST when @child already has an inode, -ENOENT when @dir is
+ * a dead directory, and otherwise whatever inode_permission() says about
+ * write + exec access to @dir.
+ */
+static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
+{
+        int error;
+        if (child->d_inode)
+                error = -EEXIST;
+        else if (IS_DEADDIR(dir))
+                error = -ENOENT;
+        else
+                error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+        return error;
+}
+/*
+ * Create a new subvolume below @parent.  This is largely modeled after
+ * sys_mkdirat and vfs_mkdir, but we only do a single component lookup
+ * inside this filesystem so it's quite a bit simpler.
+ *
+ * @parent:    path of the directory that receives the new entry
+ * @name:      name of the new entry, @namelen bytes long
+ * @mode:      requested mode; masked with the umask but not passed on yet
+ * @snap_src:  root to snapshot, or NULL to create an empty subvolume
+ *
+ * The parent's i_mutex is held for the whole operation and write access
+ * to the mount is taken around the creation.
+ *
+ * Returns 0 on success or a negative errno: the lookup error, -EEXIST,
+ * the mnt_want_write()/btrfs_may_create() error, -ENOMEM, -EMLINK when
+ * the snapshot would create a directory loop or a symlink is found on
+ * the way up, or the error from create_snapshot()/create_subvol().
+ */
+static noinline int btrfs_mksubvol(struct path *parent, char *name,
+                                   int mode, int namelen,
+                                   struct btrfs_root *snap_src)
+{
+        struct dentry *dentry;
+        int error;
+        mutex_lock_nested(&parent->dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+        dentry = lookup_one_len(name, parent->dentry, namelen);
+        error = PTR_ERR(dentry);
+        if (IS_ERR(dentry))
+                goto out_unlock;
+        error = -EEXIST;
+        if (dentry->d_inode)
+                goto out_dput;
+        if (!IS_POSIXACL(parent->dentry->d_inode))
+                mode &= ~current->fs->umask;
+        error = mnt_want_write(parent->mnt);
+        if (error)
+                goto out_dput;
+        error = btrfs_may_create(parent->dentry->d_inode, dentry);
+        if (error)
+                goto out_drop_write;
+        /*
+         * Actually perform the low-level subvolume creation after all
+         * this VFS fuzz.
+         *
+         * Eventually we want to pass in an inode under which we create this
+         * subvolume, but for now all are under the filesystem root.
+         *
+         * Also we should pass on the mode eventually to allow creating new
+         * subvolume with specific mode bits.
+         */
+        if (snap_src) {
+                struct dentry *dir = dentry->d_parent;
+                struct dentry *test = dir->d_parent;
+                struct btrfs_path *path;
+                int ret;
+                u64 test_oid;
+                u64 parent_oid = BTRFS_I(dir->d_inode)->root->root_key.objectid;
+                /* the root ref lookups below dereference the path */
+                path = btrfs_alloc_path();
+                if (!path) {
+                        error = -ENOMEM;
+                        goto out_drop_write;
+                }
+                test_oid = snap_src->root_key.objectid;
+                /* a ref found for this pair skips the loop walk below */
+                ret = btrfs_find_root_ref(snap_src->fs_info->tree_root,
+                                          path, parent_oid, test_oid);
+                if (ret == 0)
+                        goto create;
+                btrfs_release_path(snap_src->fs_info->tree_root, path);
+                /* we need to make sure we aren't creating a directory loop
+                 * by taking a snapshot of something that has our current
+                 * subvol in its directory tree.  So, this loops through
+                 * the dentries and checks the forward refs for each subvolume
+                 * to see if it references the subvolume where we are
+                 * placing this new snapshot.
+                 */
+                while (1) {
+                        /* stop at the fs root or when leaving this sb */
+                        if (!test ||
+                            dir == snap_src->fs_info->sb->s_root ||
+                            test == snap_src->fs_info->sb->s_root ||
+                            test->d_inode->i_sb != snap_src->fs_info->sb) {
+                                break;
+                        }
+                        if (S_ISLNK(test->d_inode->i_mode)) {
+                                printk(KERN_INFO "Btrfs symlink in snapshot "
+                                       "path, failed\n");
+                                error = -EMLINK;
+                                btrfs_free_path(path);
+                                goto out_drop_write;
+                        }
+                        test_oid =
+                                BTRFS_I(test->d_inode)->root->root_key.objectid;
+                        ret = btrfs_find_root_ref(snap_src->fs_info->tree_root,
+                                  path, test_oid, parent_oid);
+                        if (ret == 0) {
+                                printk(KERN_INFO "Btrfs snapshot creation "
+                                       "failed, looping\n");
+                                error = -EMLINK;
+                                btrfs_free_path(path);
+                                goto out_drop_write;
+                        }
+                        btrfs_release_path(snap_src->fs_info->tree_root, path);
+                        test = test->d_parent;
+                }
+create:
+                btrfs_free_path(path);
+                error = create_snapshot(snap_src, dentry, name, namelen);
+        } else {
+                error = create_subvol(BTRFS_I(parent->dentry->d_inode)->root,
+                                      dentry, name, namelen);
+        }
+        if (error)
+                goto out_drop_write;
+        fsnotify_mkdir(parent->dentry->d_inode, dentry);
+out_drop_write:
+        mnt_drop_write(parent->mnt);
+out_dput:
+        dput(dentry);
+out_unlock:
+        mutex_unlock(&parent->dentry->d_inode->i_mutex);
+        return error;
+}
+/*
+ * Walk every page of @file, make sure it is up to date in the page cache
+ * and re-mark it as delalloc and dirty so that it gets written out again.
+ * NOTE(review): the actual relocation presumably happens when writeback
+ * later allocates space for these pages; that part is not visible here.
+ *
+ * Runs with the inode's i_mutex held.
+ *
+ * Returns -ENOSPC when the free space check fails and 0 otherwise.  Note
+ * that 0 is also returned when the walk stops early because a page could
+ * not be grabbed or read.
+ */
+static int btrfs_defrag_file(struct file *file)
+{
+        struct inode *inode = fdentry(file)->d_inode;
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+        struct btrfs_ordered_extent *ordered;
+        struct page *page;
+        unsigned long last_index;
+        unsigned long ra_pages = root->fs_info->bdi.ra_pages;
+        unsigned long total_read = 0;
+        u64 page_start;
+        u64 page_end;
+        unsigned long i;
+        int ret;
+        /* any failure of the space check is reported as -ENOSPC */
+        ret = btrfs_check_free_space(root, inode->i_size, 0);
+        if (ret)
+                return -ENOSPC;
+        mutex_lock(&inode->i_mutex);
+        last_index = inode->i_size >> PAGE_CACHE_SHIFT;
+        for (i = 0; i <= last_index; i++) {
+                /* start readahead for the next window of ra_pages pages */
+                if (total_read % ra_pages == 0) {
+                        btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i,
+                                       min(last_index, i + ra_pages - 1));
+                }
+                total_read++;
+again:
+                page = grab_cache_page(inode->i_mapping, i);
+                if (!page)
+                        goto out_unlock;
+                if (!PageUptodate(page)) {
+                        /*
+                         * NOTE(review): the page is locked again right
+                         * after the read, so btrfs_readpage() presumably
+                         * unlocks it on completion - confirm.
+                         */
+                        btrfs_readpage(NULL, page);
+                        lock_page(page);
+                        if (!PageUptodate(page)) {
+                                /* read failed: give up on the whole file */
+                                unlock_page(page);
+                                page_cache_release(page);
+                                goto out_unlock;
+                        }
+                }
+                wait_on_page_writeback(page);
+                page_start = (u64)page->index << PAGE_CACHE_SHIFT;
+                page_end = page_start + PAGE_CACHE_SIZE - 1;
+                lock_extent(io_tree, page_start, page_end, GFP_NOFS);
+                ordered = btrfs_lookup_ordered_extent(inode, page_start);
+                if (ordered) {
+                        /*
+                         * An ordered extent still covers this page: drop
+                         * every lock, start/wait on it, then retry the
+                         * same page index.
+                         */
+                        unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+                        unlock_page(page);
+                        page_cache_release(page);
+                        btrfs_start_ordered_extent(inode, ordered, 1);
+                        btrfs_put_ordered_extent(ordered);
+                        goto again;
+                }
+                set_page_extent_mapped(page);
+                /*
+                 * this makes sure page_mkwrite is called on the
+                 * page if it is dirtied again later
+                 */
+                clear_page_dirty_for_io(page);
+                /* mark the range delalloc, then dirty the page again */
+                btrfs_set_extent_delalloc(inode, page_start, page_end);
+                unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+                set_page_dirty(page);
+                unlock_page(page);
+                page_cache_release(page);
+                balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
+        }
+out_unlock:
+        mutex_unlock(&inode->i_mutex);
+        return 0;
+}
+/*
+ * Resize one device of the filesystem.
+ *
+ * @root:  any root of the filesystem
+ * @arg:   user pointer to a struct btrfs_ioctl_vol_args whose name field
+ *         holds "[devid:]size"
+ *
+ * devid defaults to 1.  size is either "max" (use the whole block device)
+ * or a value understood by btrfs_parse_size(), optionally prefixed with
+ * '+' or '-' to grow or shrink relative to the current device size.  The
+ * resulting size must be at least 256MB and must not exceed the block
+ * device; it is rounded down to a multiple of the sector size.
+ *
+ * Returns 0 on success, -EROFS on a read-only filesystem, -EPERM without
+ * CAP_SYS_ADMIN, -ENOMEM, -EFAULT, -EINVAL for an unknown device or a bad
+ * size, -EFBIG when the size exceeds the block device, or the error from
+ * growing/shrinking the device or committing the transaction.
+ *
+ * Called inside transaction, so use GFP_NOFS
+ */
+static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
+{
+        u64 new_size;
+        u64 old_size;
+        u64 devid = 1;
+        struct btrfs_ioctl_vol_args *vol_args;
+        struct btrfs_trans_handle *trans;
+        struct btrfs_device *device = NULL;
+        char *sizestr;
+        char *devstr = NULL;
+        int ret = 0;
+        int err;
+        int mod = 0;
+        if (root->fs_info->sb->s_flags & MS_RDONLY)
+                return -EROFS;
+        if (!capable(CAP_SYS_ADMIN))
+                return -EPERM;
+        vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
+        if (!vol_args)
+                return -ENOMEM;
+        if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
+                ret = -EFAULT;
+                goto out;
+        }
+        /* never trust user space to terminate the string */
+        vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+        mutex_lock(&root->fs_info->volume_mutex);
+        sizestr = vol_args->name;
+        /* optional "devid:" prefix: split the string at the colon */
+        devstr = strchr(sizestr, ':');
+        if (devstr) {
+                char *end;
+                sizestr = devstr + 1;
+                *devstr = '\0';
+                devstr = vol_args->name;
+                devid = simple_strtoull(devstr, &end, 10);
+                /* u64 is not unsigned long long everywhere: cast for %llu */
+                printk(KERN_INFO "resizing devid %llu\n",
+                       (unsigned long long)devid);
+        }
+        device = btrfs_find_device(root, devid, NULL, NULL);
+        if (!device) {
+                printk(KERN_INFO "resizer unable to find device %llu\n",
+                       (unsigned long long)devid);
+                ret = -EINVAL;
+                goto out_unlock;
+        }
+        if (!strcmp(sizestr, "max"))
+                new_size = device->bdev->bd_inode->i_size;
+        else {
+                /* leading sign selects a relative resize */
+                if (sizestr[0] == '-') {
+                        mod = -1;
+                        sizestr++;
+                } else if (sizestr[0] == '+') {
+                        mod = 1;
+                        sizestr++;
+                }
+                new_size = btrfs_parse_size(sizestr);
+                if (new_size == 0) {
+                        ret = -EINVAL;
+                        goto out_unlock;
+                }
+        }
+        old_size = device->total_bytes;
+        if (mod < 0) {
+                /* cannot shrink by more than the current size */
+                if (new_size > old_size) {
+                        ret = -EINVAL;
+                        goto out_unlock;
+                }
+                new_size = old_size - new_size;
+        } else if (mod > 0) {
+                new_size = old_size + new_size;
+        }
+        /* refuse anything below 256MB */
+        if (new_size < 256 * 1024 * 1024) {
+                ret = -EINVAL;
+                goto out_unlock;
+        }
+        if (new_size > device->bdev->bd_inode->i_size) {
+                ret = -EFBIG;
+                goto out_unlock;
+        }
+        /* round down to a whole number of sectors */
+        do_div(new_size, root->sectorsize);
+        new_size *= root->sectorsize;
+        printk(KERN_INFO "new size for %s is %llu\n",
+                device->name, (unsigned long long)new_size);
+        if (new_size > old_size) {
+                trans = btrfs_start_transaction(root, 1);
+                ret = btrfs_grow_device(trans, device, new_size);
+                /* keep the first error, but do not lose a failed commit */
+                err = btrfs_commit_transaction(trans, root);
+                if (err && !ret)
+                        ret = err;
+        } else {
+                ret = btrfs_shrink_device(device, new_size);
+        }
+out_unlock:
+        mutex_unlock(&root->fs_info->volume_mutex);
+out:
+        kfree(vol_args);
+        return ret;
+}
+/*
+ * Create a subvolume or a snapshot below the directory @file refers to.
+ *
+ * @file:    open file the ioctl was issued on; its path is the parent
+ * @arg:     user pointer to a struct btrfs_ioctl_vol_args; name is the
+ *           new entry's name (no '/'), fd names the snapshot source
+ * @subvol:  non-zero creates an empty subvolume, zero snapshots the root
+ *           that the inode behind vol_args->fd belongs to
+ *
+ * Returns 0 on success, -EROFS on a read-only filesystem, -ENOMEM,
+ * -EFAULT, -EINVAL (name contains '/', bad fd, or source on another
+ * filesystem), -EEXIST when the name is already present in the tree
+ * root's directory, or the error from the lookup or btrfs_mksubvol().
+ */
+static noinline int btrfs_ioctl_snap_create(struct file *file,
+                                            void __user *arg, int subvol)
+{
+        struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+        struct btrfs_ioctl_vol_args *vol_args;
+        struct btrfs_dir_item *di;
+        struct btrfs_path *path;
+        struct file *src_file;
+        u64 root_dirid;
+        int namelen;
+        int ret = 0;
+        if (root->fs_info->sb->s_flags & MS_RDONLY)
+                return -EROFS;
+        vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
+        if (!vol_args)
+                return -ENOMEM;
+        if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
+                ret = -EFAULT;
+                goto out;
+        }
+        /* never trust user space to terminate the string */
+        vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+        namelen = strlen(vol_args->name);
+        /* only a single path component is accepted */
+        if (strchr(vol_args->name, '/')) {
+                ret = -EINVAL;
+                goto out;
+        }
+        path = btrfs_alloc_path();
+        if (!path) {
+                ret = -ENOMEM;
+                goto out;
+        }
+        /* was terminated by ',' which fused this with the lookup below */
+        root_dirid = root->fs_info->sb->s_root->d_inode->i_ino;
+        di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root,
+                            path, root_dirid,
+                            vol_args->name, namelen, 0);
+        /* di is only tested for NULL/error below, never dereferenced */
+        btrfs_free_path(path);
+        if (di && !IS_ERR(di)) {
+                ret = -EEXIST;
+                goto out;
+        }
+        if (IS_ERR(di)) {
+                ret = PTR_ERR(di);
+                goto out;
+        }
+        if (subvol) {
+                ret = btrfs_mksubvol(&file->f_path, vol_args->name,
+                                     file->f_path.dentry->d_inode->i_mode,
+                                     namelen, NULL);
+        } else {
+                struct inode *src_inode;
+                src_file = fget(vol_args->fd);
+                if (!src_file) {
+                        ret = -EINVAL;
+                        goto out;
+                }
+                src_inode = src_file->f_path.dentry->d_inode;
+                /* the source must live on the same superblock */
+                if (src_inode->i_sb != file->f_path.dentry->d_inode->i_sb) {
+                        printk(KERN_INFO "btrfs: Snapshot src from "
+                               "another FS\n");
+                        ret = -EINVAL;
+                        fput(src_file);
+                        goto out;
+                }
+                ret = btrfs_mksubvol(&file->f_path, vol_args->name,
+                             file->f_path.dentry->d_inode->i_mode,
+                             namelen, BTRFS_I(src_inode)->root);
+                fput(src_file);
+        }
+out:
+        kfree(vol_args);
+        return ret;
+}
+/*
+ * Defragment whatever @file refers to.
+ *
+ * Directory: requires CAP_SYS_ADMIN and defragments the file's root
+ * tree and the extent tree.  Regular file: must be open for writing and
+ * is handed to btrfs_defrag_file().  Other file types are left alone.
+ *
+ * Write access to the mount is held for the duration.
+ *
+ * Returns 0 on success, the mnt_want_write() error, -EPERM, -EINVAL, or
+ * the error from btrfs_defrag_file().
+ */
+static int btrfs_ioctl_defrag(struct file *file)
+{
+        struct inode *inode = fdentry(file)->d_inode;
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        int ret;
+        ret = mnt_want_write(file->f_path.mnt);
+        if (ret)
+                return ret;
+        switch (inode->i_mode & S_IFMT) {
+        case S_IFDIR:
+                if (!capable(CAP_SYS_ADMIN)) {
+                        ret = -EPERM;
+                        goto out;
+                }
+                btrfs_defrag_root(root, 0);
+                btrfs_defrag_root(root->fs_info->extent_root, 0);
+                break;
+        case S_IFREG:
+                if (!(file->f_mode & FMODE_WRITE)) {
+                        ret = -EINVAL;
+                        goto out;
+                }
+                /* hand a failure (-ENOSPC) back instead of dropping it */
+                ret = btrfs_defrag_file(file);
+                break;
+        default:
+                /* nothing to defragment for other file types */
+                break;
+        }
+out:
+        mnt_drop_write(file->f_path.mnt);
+        return ret;
+}
+/*
+ * Add the device named in the user supplied btrfs_ioctl_vol_args to the
+ * filesystem via btrfs_init_new_device().
+ *
+ * Returns -EPERM without CAP_SYS_ADMIN, -ENOMEM, -EFAULT when @arg can
+ * not be copied in, otherwise the result of btrfs_init_new_device().
+ */
+static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
+{
+        struct btrfs_ioctl_vol_args *args;
+        int rc;
+        if (!capable(CAP_SYS_ADMIN))
+                return -EPERM;
+        args = kmalloc(sizeof(*args), GFP_NOFS);
+        if (!args)
+                return -ENOMEM;
+        if (copy_from_user(args, arg, sizeof(*args))) {
+                rc = -EFAULT;
+        } else {
+                /* force termination before treating name as a string */
+                args->name[BTRFS_PATH_NAME_MAX] = '\0';
+                rc = btrfs_init_new_device(root, args->name);
+        }
+        kfree(args);
+        return rc;
+}
+/*
+ * Remove the device named in the user supplied btrfs_ioctl_vol_args from
+ * the filesystem via btrfs_rm_device().
+ *
+ * Returns -EPERM without CAP_SYS_ADMIN, -EROFS on a read-only
+ * filesystem, -ENOMEM, -EFAULT when @arg can not be copied in, otherwise
+ * the result of btrfs_rm_device().
+ */
+static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
+{
+        struct btrfs_ioctl_vol_args *args;
+        int rc;
+        if (!capable(CAP_SYS_ADMIN))
+                return -EPERM;
+        if (root->fs_info->sb->s_flags & MS_RDONLY)
+                return -EROFS;
+        args = kmalloc(sizeof(*args), GFP_NOFS);
+        if (!args)
+                return -ENOMEM;
+        if (copy_from_user(args, arg, sizeof(*args))) {
+                rc = -EFAULT;
+        } else {
+                /* force termination before treating name as a string */
+                args->name[BTRFS_PATH_NAME_MAX] = '\0';
+                rc = btrfs_rm_device(root, args->name);
+        }
+        kfree(args);
+        return rc;
+}
+static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
+                u64 off, u64 olen, u64 destoff)
+{
+        struct inode *inode = fdentry(file)->d_inode;
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        struct file *src_file;
+        struct inode *src;
+        struct btrfs_trans_handle *trans;
+        struct btrfs_path *path;
+        struct extent_buffer *leaf;
+        char *buf;
+        struct btrfs_key key;
+        u32 nritems;
+        int slot;
+        int ret;
+        u64 len = olen;
+        u64 bs = root->fs_info->sb->s_blocksize;
+        u64 hint_byte;
+        /*
+         * TODO:
+         * - split compressed inline extents.  annoying: we need to
+         *   decompress into destination's address_space (the file offset
+         *   may change, so source mapping won't do), then recompress (or
+         *   otherwise reinsert) a subrange.
+         * - allow ranges within the same file to be cloned (provided
+         *   they don't overlap)?
+         */
+        /* the destination must be opened for writing */
+        if (!(file->f_mode & FMODE_WRITE))
+                return -EINVAL;
+        ret = mnt_want_write(file->f_path.mnt);
+        if (ret)
+                return ret;
+        src_file = fget(srcfd);
+        if (!src_file) {
+                ret = -EBADF;
+                goto out_drop_write;
+        }
+        src = src_file->f_dentry->d_inode;
+        ret = -EINVAL;
+        if (src == inode)
+                goto out_fput;
+        ret = -EISDIR;
+        if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
+                goto out_fput;
+        ret = -EXDEV;
+        if (src->i_sb != inode->i_sb || BTRFS_I(src)->root != root)
+                goto out_fput;
+        ret = -ENOMEM;
+        buf = vmalloc(btrfs_level_size(root, 0));
+        if (!buf)
+                goto out_fput;
+        path = btrfs_alloc_path();
+        if (!path) {
+                vfree(buf);
+                goto out_fput;
+        }
+        path->reada = 2;
+        if (inode < src) {
+                mutex_lock(&inode->i_mutex);
+                mutex_lock(&src->i_mutex);
+        } else {
+                mutex_lock(&src->i_mutex);
+                mutex_lock(&inode->i_mutex);
+        }
+        /* determine range to clone */
+        ret = -EINVAL;
+        if (off >= src->i_size || off + len > src->i_size)
+                goto out_unlock;
+        if (len == 0)
+                olen = len = src->i_size - off;
+        /* if we extend to eof, continue to block boundary */
+        if (off + len == src->i_size)
+                len = ((src->i_size + bs-1) & ~(bs-1))
+                        - off;
+        /* verify the end result is block aligned */
+        if ((off & (bs-1)) ||
+            ((off + len) & (bs-1)))
+                goto out_unlock;
+        /* do any pending delalloc/csum calc on src, one way or
+           another, and lock file content */
+        while (1) {
+                struct btrfs_ordered_extent *ordered;
+                lock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
+                ordered = btrfs_lookup_first_ordered_extent(inode, off+len);
+                if (BTRFS_I(src)->delalloc_bytes == 0 && !ordered)
+                        break;
+                unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
+                if (ordered)
+                        btrfs_put_ordered_extent(ordered);
+                btrfs_wait_ordered_range(src, off, off+len);
+        }
+        trans = btrfs_start_transaction(root, 1);
+        BUG_ON(!trans);
+        /* punch hole in destination first */
+        btrfs_drop_extents(trans, root, inode, off, off+len, 0, &hint_byte);
+        /* clone data */
+        key.objectid = src->i_ino;
+        key.type = BTRFS_EXTENT_DATA_KEY;
+        key.offset = 0;
+        while (1) {
+                /*
+                 * note the key will change type as we walk through the
+                 * tree.
+                 */
+                ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
+                if (ret < 0)
+                        goto out;
+                nritems = btrfs_header_nritems(path->nodes[0]);
+                if (path->slots[0] >= nritems) {
+                        ret = btrfs_next_leaf(root, path);
+                        if (ret < 0)
+                                goto out;
+                        if (ret > 0)
+                                break;
+                        nritems = btrfs_header_nritems(path->nodes[0]);
+                }
+                leaf = path->nodes[0];
+                slot = path->slots[0];
+                btrfs_item_key_to_cpu(leaf, &key, slot);
+                if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY ||
+                    key.objectid != src->i_ino)
+                        break;
+                if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
+                        struct btrfs_file_extent_item *extent;
+                        int type;
+                        u32 size;
+                        struct btrfs_key new_key;
+                        u64 disko = 0, diskl = 0;
+                        u64 datao = 0, datal = 0;
+                        u8 comp;
+                        size = btrfs_item_size_nr(leaf, slot);
+                        read_extent_buffer(leaf, buf,
+                                           btrfs_item_ptr_offset(leaf, slot),
+                                           size);
+                        extent = btrfs_item_ptr(leaf, slot,
+                                                struct btrfs_file_extent_item);
+                        comp = btrfs_file_extent_compression(leaf, extent);
+                        type = btrfs_file_extent_type(leaf, extent);
+                        if (type == BTRFS_FILE_EXTENT_REG) {
+                                disko = btrfs_file_extent_disk_bytenr(leaf,
+                                                                      extent);
+                                diskl = btrfs_file_extent_disk_num_bytes(leaf,
+                                                                 extent);
+                                datao = btrfs_file_extent_offset(leaf, extent);
+                                datal = btrfs_file_extent_num_bytes(leaf,
+                                                                    extent);
+                        } else if (type == BTRFS_FILE_EXTENT_INLINE) {
+                                /* take upper bound, may be compressed */
+                                datal = btrfs_file_extent_ram_bytes(leaf,
+                                                                    extent);
+                        }
+                        btrfs_release_path(root, path);
+                        if (key.offset + datal < off ||
+                            key.offset >= off+len)
+                                goto next;
+                        memcpy(&new_key, &key, sizeof(new_key));
+                        new_key.objectid = inode->i_ino;
+                        new_key.offset = key.offset + destoff - off;
+                        if (type == BTRFS_FILE_EXTENT_REG) {
+                                ret = btrfs_insert_empty_item(trans, root, path,
+                                                              &new_key, size);
+                                if (ret)
+                                        goto out;
+                                leaf = path->nodes[0];
+                                slot = path->slots[0];
+                                write_extent_buffer(leaf, buf,
+                                            btrfs_item_ptr_offset(leaf, slot),
+                                            size);
+                                extent = btrfs_item_ptr(leaf, slot,
+                                                struct btrfs_file_extent_item);
+                                if (off > key.offset) {
+                                        datao += off - key.offset;
+                                        datal -= off - key.offset;
+                                }
+                                if (key.offset + datao + datal + key.offset >
+                                    off + len)
+                                        datal = off + len - key.offset - datao;
+                                /* disko == 0 means it's a hole */
+                                if (!disko)
+                                        datao = 0;
+                                btrfs_set_file_extent_offset(leaf, extent,
+                                                             datao);
+                                btrfs_set_file_extent_num_bytes(leaf, extent,
+                                                                datal);
+                                if (disko) {
+                                        inode_add_bytes(inode, datal);
+                                        ret = btrfs_inc_extent_ref(trans, root,
+                                                   disko, diskl, leaf->start,
+                                                   root->root_key.objectid,
+                                                   trans->transid,
+                                                   inode->i_ino);
+                                        BUG_ON(ret);
+                                }
+                        } else if (type == BTRFS_FILE_EXTENT_INLINE) {
+                                u64 skip = 0;
+                                u64 trim = 0;
+                                if (off > key.offset) {
+                                        skip = off - key.offset;
+                                        new_key.offset += skip;
+                                }
+                                if (key.offset + datal > off+len)
+                                        trim = key.offset + datal - (off+len);
+                                if (comp && (skip || trim)) {
+                                        ret = -EINVAL;
+                                        goto out;
+                                }
+                                size -= skip + trim;
+                                datal -= skip + trim;
+                                ret = btrfs_insert_empty_item(trans, root, path,
+                                                              &new_key, size);
+                                if (ret)
+                                        goto out;
+                                if (skip) {
+                                        u32 start =
+                                          btrfs_file_extent_calc_inline_size(0);
+                                        memmove(buf+start, buf+start+skip,
+                                                datal);
+                                }
+                                leaf = path->nodes[0];
+                                slot = path->slots[0];
+                                write_extent_buffer(leaf, buf,
+                                            btrfs_item_ptr_offset(leaf, slot),
+                                            size);
+                                inode_add_bytes(inode, datal);
+                        }
+                        btrfs_mark_buffer_dirty(leaf);
+                }
+next:
+                btrfs_release_path(root, path);
+                key.offset++;
+        }
+        ret = 0;
+out:
+        btrfs_release_path(root, path);
+        if (ret == 0) {
+                inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+                if (destoff + olen > inode->i_size)
+                        btrfs_i_size_write(inode, destoff + olen);
+                BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
+                ret = btrfs_update_inode(trans, root, inode);
+        }
+        btrfs_end_transaction(trans, root);
+        unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
+        if (ret)
+                vmtruncate(inode, 0);
+out_unlock:
+        mutex_unlock(&src->i_mutex);
+        mutex_unlock(&inode->i_mutex);
+        vfree(buf);
+        btrfs_free_path(path);
+out_fput:
+        fput(src_file);
+out_drop_write:
+        mnt_drop_write(file->f_path.mnt);
+        return ret;
+}
+/*
+ * BTRFS_IOC_CLONE_RANGE entry point: copy the clone-range arguments in from
+ * user space and forward them to btrfs_ioctl_clone().
+ *
+ * Returns -EFAULT when the argument structure cannot be copied in,
+ * otherwise whatever btrfs_ioctl_clone() returns.
+ */
+static long btrfs_ioctl_clone_range(struct file *file, void __user *argp)
+{
+        struct btrfs_ioctl_clone_range_args range;
+        if (copy_from_user(&range, argp, sizeof(range)) != 0)
+                return -EFAULT;
+        return btrfs_ioctl_clone(file, range.src_fd, range.src_offset,
+                                 range.src_length, range.dest_offset);
+}
+/*
+ * there are many ways the trans_start and trans_end ioctls can lead
+ * to deadlocks.  They should only be used by applications that
+ * basically own the machine, and have a very in depth understanding
+ * of all the possible deadlocks and enospc problems.
+ *
+ * Starts an ioctl transaction and parks its handle in file->private_data
+ * until btrfs_ioctl_trans_end() picks it up.
+ *
+ * Returns 0 on success, -EPERM without CAP_SYS_ADMIN, -EINPROGRESS when the
+ * file already carries a transaction handle, the mnt_want_write() error if
+ * write access to the mount is refused, or -ENOMEM when no transaction
+ * handle could be obtained.
+ */
+static long btrfs_ioctl_trans_start(struct file *file)
+{
+        struct inode *inode = fdentry(file)->d_inode;
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        struct btrfs_trans_handle *trans;
+        int ret;
+        if (!capable(CAP_SYS_ADMIN))
+                return -EPERM;
+        /* only one ioctl transaction can be attached to a file at a time */
+        if (file->private_data)
+                return -EINPROGRESS;
+        ret = mnt_want_write(file->f_path.mnt);
+        if (ret)
+                return ret;
+        mutex_lock(&root->fs_info->trans_mutex);
+        root->fs_info->open_ioctl_trans++;
+        mutex_unlock(&root->fs_info->trans_mutex);
+        trans = btrfs_start_ioctl_transaction(root, 0);
+        if (!trans) {
+                ret = -ENOMEM;
+                goto out_drop;
+        }
+        file->private_data = trans;
+        return 0;
+out_drop:
+        /*
+         * No handle was stored, so btrfs_ioctl_trans_end() will never undo
+         * the counter bump and the mount write reference for us: drop both
+         * here, the same way btrfs_ioctl_trans_end() does.
+         */
+        mutex_lock(&root->fs_info->trans_mutex);
+        root->fs_info->open_ioctl_trans--;
+        mutex_unlock(&root->fs_info->trans_mutex);
+        mnt_drop_write(file->f_path.mnt);
+        return ret;
+}
+/*
+ * Counterpart of btrfs_ioctl_trans_start(): ends the transaction whose
+ * handle is stored in file->private_data, clears that pointer, decrements
+ * open_ioctl_trans under trans_mutex and drops the mount write reference.
+ *
+ * The trans_start and trans_end ioctls can lead to deadlocks in many ways.
+ * They should only be used by applications that basically own the machine
+ * and understand all the possible deadlocks and enospc problems in depth.
+ *
+ * Returns 0 on success, or -EINVAL when the file has no transaction handle.
+ */
+long btrfs_ioctl_trans_end(struct file *file)
+{
+        struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+        struct btrfs_trans_handle *handle = file->private_data;
+        if (!handle)
+                return -EINVAL;
+        btrfs_end_transaction(handle, root);
+        file->private_data = NULL;
+        mutex_lock(&root->fs_info->trans_mutex);
+        root->fs_info->open_ioctl_trans--;
+        mutex_unlock(&root->fs_info->trans_mutex);
+        mnt_drop_write(file->f_path.mnt);
+        return 0;
+}
+/*
+ * ioctl dispatcher: routes each BTRFS_IOC_* command to its handler and
+ * returns the handler's result.  Commands not listed here get -ENOTTY.
+ */
+long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+        void __user *argp = (void __user *)arg;
+        struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+        switch (cmd) {
+        /* snapshots and subvolumes share one handler, told apart by a flag */
+        case BTRFS_IOC_SNAP_CREATE:
+                return btrfs_ioctl_snap_create(file, argp, 0);
+        case BTRFS_IOC_SUBVOL_CREATE:
+                return btrfs_ioctl_snap_create(file, argp, 1);
+        /* cloning: the whole-file variant passes the raw ioctl argument */
+        case BTRFS_IOC_CLONE:
+                return btrfs_ioctl_clone(file, arg, 0, 0, 0);
+        case BTRFS_IOC_CLONE_RANGE:
+                return btrfs_ioctl_clone_range(file, argp);
+        /* device and space management */
+        case BTRFS_IOC_RESIZE:
+                return btrfs_ioctl_resize(root, argp);
+        case BTRFS_IOC_ADD_DEV:
+                return btrfs_ioctl_add_dev(root, argp);
+        case BTRFS_IOC_RM_DEV:
+                return btrfs_ioctl_rm_dev(root, argp);
+        case BTRFS_IOC_BALANCE:
+                return btrfs_balance(root->fs_info->dev_root);
+        case BTRFS_IOC_DEFRAG:
+                return btrfs_ioctl_defrag(file);
+        /* user-controlled transactions */
+        case BTRFS_IOC_TRANS_START:
+                return btrfs_ioctl_trans_start(file);
+        case BTRFS_IOC_TRANS_END:
+                return btrfs_ioctl_trans_end(file);
+        case BTRFS_IOC_SYNC:
+                /* the result of btrfs_sync_fs() is not reported */
+                btrfs_sync_fs(file->f_dentry->d_sb, 1);
+                return 0;
+        default:
+                break;
+        }
+        return -ENOTTY;
+}
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
new file mode 100644
index 000000000000..b320b103fa13
--- /dev/null
+++ b/fs/btrfs/ioctl.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#ifndef __IOCTL_
+#define __IOCTL_
+#include <linux/ioctl.h>
+#define BTRFS_IOCTL_MAGIC 0x94
+#define BTRFS_VOL_NAME_MAX 255
+#define BTRFS_PATH_NAME_MAX 4087
+/* this should be 4k: the 8 byte fd plus the BTRFS_PATH_NAME_MAX + 1 (4088)
+ * bytes of name add up to 4096.  This is the argument type named by the
+ * _IOW() ioctl definitions below that take struct btrfs_ioctl_vol_args.
+ */
+struct btrfs_ioctl_vol_args {
+        __s64 fd; /* NOTE(review): meaning depends on the ioctl - confirm */
+        char name[BTRFS_PATH_NAME_MAX + 1]; /* room for the max name + 1 byte */
+};
+struct btrfs_ioctl_clone_range_args { /* argument of BTRFS_IOC_CLONE_RANGE */
+  __s64 src_fd; /* descriptor of the file to clone from */
+  __u64 src_offset, src_length; /* source range; a length of 0 clones up to
+                                 * the end of the source file */
+  __u64 dest_offset; /* passed to btrfs_ioctl_clone() as its last argument;
+                      * presumably the offset in the destination - confirm */
+};
+#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
+                                   struct btrfs_ioctl_vol_args) /* btrfs_ioctl_snap_create(.., 0) */
+#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
+                                   struct btrfs_ioctl_vol_args) /* btrfs_ioctl_defrag() */
+#define BTRFS_IOC_RESIZE _IOW(BTRFS_IOCTL_MAGIC, 3, \
+                                   struct btrfs_ioctl_vol_args) /* btrfs_ioctl_resize() */
+#define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \
+                                   struct btrfs_ioctl_vol_args) /* not in btrfs_ioctl()'s switch */
+/* trans start and trans end are dangerous, and only for
+ * use by applications that know how to avoid the
+ * resulting deadlocks
+ *
+ * command number 5 is not assigned in this header
+ */
+#define BTRFS_IOC_TRANS_START  _IO(BTRFS_IOCTL_MAGIC, 6)
+#define BTRFS_IOC_TRANS_END    _IO(BTRFS_IOCTL_MAGIC, 7)
+#define BTRFS_IOC_SYNC         _IO(BTRFS_IOCTL_MAGIC, 8)
+#define BTRFS_IOC_CLONE        _IOW(BTRFS_IOCTL_MAGIC, 9, int) /* arg is passed by value as the source fd */
+#define BTRFS_IOC_ADD_DEV _IOW(BTRFS_IOCTL_MAGIC, 10, \
+                                   struct btrfs_ioctl_vol_args) /* btrfs_ioctl_add_dev() */
+#define BTRFS_IOC_RM_DEV _IOW(BTRFS_IOCTL_MAGIC, 11, \
+                                   struct btrfs_ioctl_vol_args) /* btrfs_ioctl_rm_dev() */
+#define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \
+                                   struct btrfs_ioctl_vol_args) /* btrfs_balance(); argument is not read */
+#define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \
+                                  struct btrfs_ioctl_clone_range_args) /* btrfs_ioctl_clone_range() */
+#define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \
+                                   struct btrfs_ioctl_vol_args) /* btrfs_ioctl_snap_create(.., 1) */
+#endif
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
new file mode 100644
index 000000000000..39bae7761db6
--- /dev/null
+++ b/fs/btrfs/locking.c
@@ -0,0 +1,88 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#include <linux/sched.h>
+#include <linux/gfp.h>
+#include <linux/pagemap.h>
+#include <linux/spinlock.h>
+#include <linux/page-flags.h>
+#include <asm/bug.h>
+#include "ctree.h"
+#include "extent_io.h"
+#include "locking.h"
+/*
+ * Take the per-buffer mutex of an extent buffer with an adaptive scheme:
+ * try the lock once, retry up to 512 times with cpu_relax() in between,
+ * and only then block on the mutex.  The spin is not tuned very
+ * extensively; spinning makes a big difference in almost every workload,
+ * but spinning for the right amount of time needs some help.
+ *
+ * In general, we want to spin as long as the lock holder is doing btree
+ * searches, and give up if it is in more expensive code.
+ *
+ * Always returns 0, with eb->mutex held.
+ */
+int btrfs_tree_lock(struct extent_buffer *eb)
+{
+        int spins_left = 512;
+        if (mutex_trylock(&eb->mutex))
+                return 0;
+        while (spins_left-- > 0) {
+                cpu_relax();
+                if (mutex_trylock(&eb->mutex))
+                        return 0;
+        }
+        cpu_relax();
+        /* the nesting subclass is derived from the buffer's tree level */
+        mutex_lock_nested(&eb->mutex, BTRFS_MAX_LEVEL - btrfs_header_level(eb));
+        return 0;
+}
+/*
+ * Single non-blocking attempt at the extent buffer mutex; hands back the
+ * mutex_trylock() result unchanged.
+ */
+int btrfs_try_tree_lock(struct extent_buffer *eb)
+{
+        int acquired = mutex_trylock(&eb->mutex);
+        return acquired;
+}
+/* Release the extent buffer mutex.  Always returns 0. */
+int btrfs_tree_unlock(struct extent_buffer *eb)
+{
+        struct mutex *lock = &eb->mutex;
+        mutex_unlock(lock);
+        return 0;
+}
+/*
+ * Report whether the extent buffer mutex is currently held, as seen by
+ * mutex_is_locked().
+ */
+int btrfs_tree_locked(struct extent_buffer *eb)
+{
+        int held = mutex_is_locked(&eb->mutex);
+        return held;
+}
+/*
+ * btrfs_search_slot uses this to decide if it should drop its locks
+ * before doing something expensive like allocating free blocks for cow.
+ *
+ * Looks at the path's buffers at 'level' and 'level + 1' (never beyond
+ * BTRFS_MAX_LEVEL, and stopping at the first empty slot).  Returns 1 as
+ * soon as one of their mutexes has a non-empty wait list, 0 otherwise.
+ */
+int btrfs_path_lock_waiting(struct btrfs_path *path, int level)
+{
+        int cur = level;
+        while (cur <= level + 1 && cur < BTRFS_MAX_LEVEL) {
+                struct extent_buffer *buf = path->nodes[cur];
+                if (!buf)
+                        return 0;
+                smp_mb();
+                if (!list_empty(&buf->mutex.wait_list))
+                        return 1;
+                cur++;
+        }
+        return 0;
+}
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
new file mode 100644
index 000000000000..bc1faef12519
--- /dev/null
+++ b/fs/btrfs/locking.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#ifndef __BTRFS_LOCKING_
+#define __BTRFS_LOCKING_
+int btrfs_tree_lock(struct extent_buffer *eb); /* spin, then block; returns 0 */
+int btrfs_tree_unlock(struct extent_buffer *eb); /* always returns 0 */
+int btrfs_tree_locked(struct extent_buffer *eb); /* mutex_is_locked() result */
+int btrfs_try_tree_lock(struct extent_buffer *eb); /* mutex_trylock() result */
+int btrfs_path_lock_waiting(struct btrfs_path *path, int level); /* 1 when a
+        * buffer mutex at level or level + 1 has waiters, otherwise 0 */
+#endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
new file mode 100644
index 000000000000..a20940170274
--- /dev/null
+++ b/fs/btrfs/ordered-data.c
@@ -0,0 +1,730 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/writeback.h>
+#include <linux/pagevec.h>
+#include "ctree.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "extent_io.h"
+/*
+ * First file offset past an ordered extent.  When file_offset + len wraps
+ * around, the result is clamped to (u64)-1.
+ */
+static u64 entry_end(struct btrfs_ordered_extent *entry)
+{
+        u64 end = entry->file_offset + entry->len;
+        /* a sum smaller than its own start means the addition wrapped */
+        return end < entry->file_offset ? (u64)-1 : end;
+}
+/*
+ * Link 'node' into the rb-tree at the position given by file_offset.
+ *
+ * Returns NULL when the node was inserted.  If file_offset falls inside an
+ * entry that is already in the tree, nothing is inserted and that entry's
+ * node is returned instead.
+ */
+static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
+                                   struct rb_node *node)
+{
+        struct rb_node **link = &root->rb_node;
+        struct rb_node *parent = NULL;
+        while (*link) {
+                struct btrfs_ordered_extent *cur;
+                parent = *link;
+                cur = rb_entry(parent, struct btrfs_ordered_extent, rb_node);
+                if (file_offset < cur->file_offset) {
+                        link = &parent->rb_left;
+                        continue;
+                }
+                /* at or past the start: inside the entry means a collision */
+                if (file_offset < entry_end(cur))
+                        return parent;
+                link = &parent->rb_right;
+        }
+        rb_link_node(node, parent, link);
+        rb_insert_color(node, root);
+        return NULL;
+}
+/*
+ * look for a given offset in the tree, and if it can't be found return the
+ * first lesser offset
+ *
+ * Returns the node whose [file_offset, entry_end()) range contains
+ * file_offset, or NULL when no entry contains it.  On a miss, and only if
+ * prev_ret is non-NULL, *prev_ret is set to a neighbouring node (NULL for
+ * an empty tree): starting from the last node visited by the descent, the
+ * walk first moves forward while the successor still ends at or before
+ * file_offset, then moves backward while the candidate ends after
+ * file_offset, stopping early at either edge of the tree.
+ */
+static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
+                                     struct rb_node **prev_ret)
+{
+        struct rb_node *n = root->rb_node;
+        struct rb_node *prev = NULL;
+        struct rb_node *test;
+        struct btrfs_ordered_extent *entry;
+        struct btrfs_ordered_extent *prev_entry = NULL;
+        while (n) {
+                entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
+                prev = n; /* remember the last node visited */
+                prev_entry = entry;
+                if (file_offset < entry->file_offset)
+                        n = n->rb_left;
+                else if (file_offset >= entry_end(entry))
+                        n = n->rb_right;
+                else
+                        return n;
+        }
+        if (!prev_ret) /* caller only wants an exact hit */
+                return NULL;
+        while (prev && file_offset >= entry_end(prev_entry)) {
+                test = rb_next(prev);
+                if (!test)
+                        break;
+                prev_entry = rb_entry(test, struct btrfs_ordered_extent,
+                                      rb_node);
+                if (file_offset < entry_end(prev_entry))
+                        break;
+                prev = test;
+        }
+        if (prev) /* prev_entry may describe a successor; resync it with prev */
+                prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
+                                      rb_node);
+        while (prev && file_offset < entry_end(prev_entry)) {
+                test = rb_prev(prev);
+                if (!test)
+                        break;
+                prev_entry = rb_entry(test, struct btrfs_ordered_extent,
+                                      rb_node);
+                prev = test;
+        }
+        *prev_ret = prev;
+        return NULL;
+}
+/*
+ * helper to check if a given offset is inside a given entry
+ *
+ * Returns 1 when file_offset lies in [entry->file_offset, entry_end(entry)),
+ * 0 otherwise.  The upper bound comes from entry_end() so that an entry
+ * whose file_offset + len wraps around is treated the same way here as in
+ * tree_insert() and __tree_search(), instead of comparing against the
+ * wrapped sum.
+ */
+static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
+{
+        if (file_offset < entry->file_offset)
+                return 0;
+        if (file_offset >= entry_end(entry))
+                return 0;
+        return 1;
+}
+/*
+ * Find the first ordered struct that contains this offset; failing that,
+ * fall back to the neighbouring node reported by __tree_search().
+ *
+ * tree->last caches the most recent result and is tried first.  Returns
+ * NULL only when the tree yields no node at all.
+ */
+static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
+                                          u64 file_offset)
+{
+        struct rb_node *neighbour;
+        struct rb_node *found;
+        /* fast path: the cached node still covers the offset */
+        if (tree->last &&
+            offset_in_entry(rb_entry(tree->last, struct btrfs_ordered_extent,
+                                     rb_node), file_offset))
+                return tree->last;
+        found = __tree_search(&tree->tree, file_offset, &neighbour);
+        if (!found)
+                found = neighbour;
+        if (found)
+                tree->last = found;
+        return found;
+}
+/* allocate and add a new ordered_extent into the per-inode tree.
+ * file_offset is the logical offset in the file
+ *
+ * start is the disk block number of an extent already reserved in the
+ * extent allocation tree
+ *
+ * len is the length of the extent
+ *
+ * disk_len is stored in the entry's disk_len field
+ *
+ * type is set as a bit in the entry's flags, unless it is
+ * BTRFS_ORDERED_IO_DONE or BTRFS_ORDERED_COMPLETE
+ *
+ * This also sets the EXTENT_ORDERED bit on the range in the inode.
+ *
+ * The tree is given a single reference on the ordered extent that was
+ * inserted.
+ *
+ * Returns 0 on success or -ENOMEM if the entry cannot be allocated.  An
+ * overlap with an entry already in the tree trips BUG_ON().
+ */
+int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
+                             u64 start, u64 len, u64 disk_len, int type)
+{
+        struct btrfs_ordered_inode_tree *tree;
+        struct rb_node *node;
+        struct btrfs_ordered_extent *entry;
+        tree = &BTRFS_I(inode)->ordered_tree;
+        entry = kzalloc(sizeof(*entry), GFP_NOFS);
+        if (!entry)
+                return -ENOMEM;
+        mutex_lock(&tree->mutex);
+        entry->file_offset = file_offset;
+        entry->start = start;
+        entry->len = len;
+        entry->disk_len = disk_len;
+        entry->inode = inode;
+        if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
+                set_bit(type, &entry->flags);
+        /* one ref for the tree */
+        atomic_set(&entry->refs, 1);
+        init_waitqueue_head(&entry->wait);
+        INIT_LIST_HEAD(&entry->list);
+        INIT_LIST_HEAD(&entry->root_extent_list);
+        node = tree_insert(&tree->tree, file_offset,
+                           &entry->rb_node);
+        /* a non-NULL node means the range collides with an existing entry */
+        BUG_ON(node);
+        set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset,
+                           entry_end(entry) - 1, GFP_NOFS);
+        spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
+        list_add_tail(&entry->root_extent_list,
+                      &BTRFS_I(inode)->root->fs_info->ordered_extents);
+        spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
+        mutex_unlock(&tree->mutex);
+        return 0;
+}
+/*
+ * Queue a struct btrfs_ordered_sum on an ordered extent: the sum is
+ * appended to the tail of entry->list while the inode's ordered tree mutex
+ * is held.  Always returns 0.
+ */
+int btrfs_add_ordered_sum(struct inode *inode,
+                          struct btrfs_ordered_extent *entry,
+                          struct btrfs_ordered_sum *sum)
+{
+        struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
+        mutex_lock(&tree->mutex);
+        list_add_tail(&sum->list, &entry->list);
+        mutex_unlock(&tree->mutex);
+        return 0;
+}
+/*
+ * Account for finished IO on [file_offset, file_offset + io_size) of the
+ * file.  The IO should not span ordered extents.
+ *
+ * The EXTENT_ORDERED bits of the range are cleared first.  1 is returned
+ * only when the ordered extent containing file_offset no longer has
+ * EXTENT_ORDERED set anywhere in its range and this call is the one that
+ * sets BTRFS_ORDERED_IO_DONE; test_and_set_bit on that flag makes sure 1 is
+ * handed out only once per ordered extent.  In every other case, including
+ * when no ordered extent contains file_offset, 0 is returned.
+ */
+int btrfs_dec_test_ordered_pending(struct inode *inode,
+                                   u64 file_offset, u64 io_size)
+{
+        struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
+        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+        struct btrfs_ordered_extent *entry;
+        struct rb_node *node;
+        int finished = 0;
+        mutex_lock(&tree->mutex);
+        clear_extent_ordered(io_tree, file_offset, file_offset + io_size - 1,
+                             GFP_NOFS);
+        node = tree_search(tree, file_offset);
+        if (!node)
+                goto unlock;
+        entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+        if (!offset_in_entry(entry, file_offset))
+                goto unlock;
+        /* some part of the extent is still marked ordered: not done yet */
+        if (test_range_bit(io_tree, entry->file_offset,
+                           entry->file_offset + entry->len - 1,
+                           EXTENT_ORDERED, 0))
+                goto unlock;
+        finished = !test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
+unlock:
+        mutex_unlock(&tree->mutex);
+        return finished;
+}
+/*
+ * Drop one reference on an ordered extent.  When the last reference goes
+ * away, every btrfs_ordered_sum still queued on entry->list is freed and
+ * then the extent itself.  Always returns 0.
+ */
+int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
+{
+        struct btrfs_ordered_sum *sum;
+        if (!atomic_dec_and_test(&entry->refs))
+                return 0;
+        while (!list_empty(&entry->list)) {
+                sum = list_entry(entry->list.next, struct btrfs_ordered_sum,
+                                 list);
+                list_del(&sum->list);
+                kfree(sum);
+        }
+        kfree(entry);
+        return 0;
+}
+/*
+ * Remove an ordered extent from the per-inode tree and from the fs-wide
+ * ordered_extents list, and mark it BTRFS_ORDERED_COMPLETE.  No references
+ * are dropped, but anyone waiting on this extent is woken up.
+ * Always returns 0.
+ */
+int btrfs_remove_ordered_extent(struct inode *inode,
+                                struct btrfs_ordered_extent *entry)
+{
+        struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
+        mutex_lock(&tree->mutex);
+        rb_erase(&entry->rb_node, &tree->tree);
+        /* the cached lookup result may be the node that was just erased */
+        tree->last = NULL;
+        set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
+        spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
+        list_del_init(&entry->root_extent_list);
+        spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
+        mutex_unlock(&tree->mutex);
+        wake_up(&entry->wait);
+        return 0;
+}
+/*
+ * wait for all the ordered extents in a root.  This is done when balancing
+ * space between drives.
+ *
+ * The fs_info-wide ordered_extents list is spliced onto a private list and
+ * drained one entry at a time.  When nocow_only is set, entries carrying
+ * neither BTRFS_ORDERED_NOCOW nor BTRFS_ORDERED_PREALLOC are moved back to
+ * the fs_info list and skipped.  For every other entry a reference is
+ * taken, ordered_extent_lock is dropped, and btrfs_start_ordered_extent()
+ * is called with wait == 1 if the owning inode could still be grabbed.
+ *
+ * Always returns 0.
+ */
+int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)
+{
+        struct list_head splice;
+        struct list_head *cur;
+        struct btrfs_ordered_extent *ordered;
+        struct inode *inode;
+        INIT_LIST_HEAD(&splice);
+        spin_lock(&root->fs_info->ordered_extent_lock);
+        list_splice_init(&root->fs_info->ordered_extents, &splice);
+        while (!list_empty(&splice)) {
+                cur = splice.next;
+                ordered = list_entry(cur, struct btrfs_ordered_extent,
+                                     root_extent_list);
+                if (nocow_only &&
+                    !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) &&
+                    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
+                        list_move(&ordered->root_extent_list,
+                                  &root->fs_info->ordered_extents);
+                        cond_resched_lock(&root->fs_info->ordered_extent_lock);
+                        continue;
+                }
+                list_del_init(&ordered->root_extent_list);
+                atomic_inc(&ordered->refs); /* dropped again once handled below */
+                /*
+                 * the inode may be getting freed (in sys_unlink path).
+                 * igrab() then returns NULL and only the extent reference
+                 * is dropped.
+                 */
+                inode = igrab(ordered->inode);
+                spin_unlock(&root->fs_info->ordered_extent_lock);
+                if (inode) {
+                        btrfs_start_ordered_extent(inode, ordered, 1);
+                        btrfs_put_ordered_extent(ordered);
+                        iput(inode);
+                } else {
+                        btrfs_put_ordered_extent(ordered);
+                }
+                spin_lock(&root->fs_info->ordered_extent_lock); /* for the next pass */
+        }
+        spin_unlock(&root->fs_info->ordered_extent_lock);
+        return 0;
+}
+/*
+ * Kick off writeback for one ordered extent and optionally wait for it.
+ *
+ * IO is always started on the byte range covered by the extent.  When
+ * 'wait' is non-zero the caller then sleeps until BTRFS_ORDERED_COMPLETE
+ * is set on the extent, i.e. until the pages are written and the io
+ * completion code has inserted the extent's metadata into the btree.
+ */
+void btrfs_start_ordered_extent(struct inode *inode,
+                                       struct btrfs_ordered_extent *entry,
+                                       int wait)
+{
+        u64 first = entry->file_offset;
+        u64 last = first + entry->len - 1;
+        /*
+         * Pages here may be dirty, clean or under writeback.  Push the
+         * dirty ones out now so a waiter is not left stalled until
+         * pdflush gets around to them.
+         */
+        btrfs_fdatawrite_range(inode->i_mapping, first, last, WB_SYNC_ALL);
+        if (!wait)
+                return;
+        wait_event(entry->wait,
+                   test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags));
+}
+/*
+ * Used to wait on ordered extents across a large range of bytes.
+ *
+ * @inode: inode whose ordered extents are waited on
+ * @start: first byte of the range
+ * @len:   length of the range in bytes; the end of the range is clamped
+ *         to INT_LIMIT(loff_t), including when start + len wraps
+ *
+ * Writeback is started on the range, then the ordered extents found in
+ * it are waited on one by one, walking from the end of the range back
+ * towards 'start'.  The whole pass is repeated for as long as
+ * test_range_bit() still reports EXTENT_ORDERED or EXTENT_DELALLOC on
+ * the range.  Always returns 0.
+ */
+int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
+{
+        u64 end;
+        u64 orig_end;
+        struct btrfs_ordered_extent *ordered;
+        if (start + len < start) {
+                orig_end = INT_LIMIT(loff_t);
+        } else {
+                orig_end = start + len - 1;
+                if (orig_end > INT_LIMIT(loff_t))
+                        orig_end = INT_LIMIT(loff_t);
+        }
+again:
+        /* start IO across the range first to instantiate any delalloc
+         * extents
+         */
+        btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_NONE);
+        /* The compression code will leave pages locked but return from
+         * writepage without setting the page writeback.  Starting again
+         * with WB_SYNC_ALL will end up waiting for the IO to actually start.
+         */
+        btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL);
+        btrfs_wait_on_page_writeback_range(inode->i_mapping,
+                                           start >> PAGE_CACHE_SHIFT,
+                                           orig_end >> PAGE_CACHE_SHIFT);
+        end = orig_end;
+        while (1) {
+                ordered = btrfs_lookup_first_ordered_extent(inode, end);
+                if (!ordered)
+                        break;
+                /* stop once the extent found lies outside the range */
+                if (ordered->file_offset > orig_end) {
+                        btrfs_put_ordered_extent(ordered);
+                        break;
+                }
+                if (ordered->file_offset + ordered->len < start) {
+                        btrfs_put_ordered_extent(ordered);
+                        break;
+                }
+                btrfs_start_ordered_extent(inode, ordered, 1);
+                /* continue the search just below the extent we waited on */
+                end = ordered->file_offset;
+                btrfs_put_ordered_extent(ordered);
+                if (end == 0 || end == start)
+                        break;
+                end--;
+        }
+        if (test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
+                           EXTENT_ORDERED | EXTENT_DELALLOC, 0)) {
+                /*
+                 * schedule_timeout() only sleeps when the task state has
+                 * been changed from TASK_RUNNING first.  Use the helper
+                 * that sets the state, otherwise this retry loop spins
+                 * instead of backing off for a jiffy.
+                 */
+                schedule_timeout_uninterruptible(1);
+                goto again;
+        }
+        return 0;
+}
+/*
+ * Look up the ordered extent that contains file_offset.
+ *
+ * Returns the extent with an extra reference held (release it with
+ * btrfs_put_ordered_extent()), or NULL when the search finds nothing or
+ * the entry found does not cover file_offset.
+ */
+struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
+                                                         u64 file_offset)
+{
+        struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
+        struct btrfs_ordered_extent *found = NULL;
+        struct rb_node *node;
+        mutex_lock(&tree->mutex);
+        node = tree_search(tree, file_offset);
+        if (node) {
+                found = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+                /* only hand the entry out if the offset really is inside it */
+                if (offset_in_entry(found, file_offset))
+                        atomic_inc(&found->refs);
+                else
+                        found = NULL;
+        }
+        mutex_unlock(&tree->mutex);
+        return found;
+}
+/*
+ * Return whatever ordered extent tree_search() yields for file_offset,
+ * without checking that the offset lies inside it.  The extent comes
+ * back with an extra reference held; NULL is returned if the search
+ * finds nothing.
+ */
+struct btrfs_ordered_extent *
+btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
+{
+        struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
+        struct btrfs_ordered_extent *found = NULL;
+        struct rb_node *node;
+        mutex_lock(&tree->mutex);
+        node = tree_search(tree, file_offset);
+        if (node) {
+                found = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+                atomic_inc(&found->refs);
+        }
+        mutex_unlock(&tree->mutex);
+        return found;
+}
+/*
+ * After an extent is done, call this to conditionally update the on disk
+ * i_size.  i_size is updated to cover any fully written part of the file.
+ *
+ * @inode:   inode owning the ordered extent
+ * @ordered: the ordered extent that just finished
+ *
+ * BTRFS_I(inode)->disk_i_size is only updated when no delalloc bytes and
+ * no other ordered extents remain between the current disk_i_size and
+ * 'ordered'.  The in-memory i_size is always sampled with i_size_read()
+ * so that a concurrent update cannot be observed half-written on 32-bit
+ * machines.  Runs under the ordered tree mutex and always returns 0.
+ */
+int btrfs_ordered_update_i_size(struct inode *inode,
+                                struct btrfs_ordered_extent *ordered)
+{
+        struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
+        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+        u64 disk_i_size;
+        u64 new_i_size;
+        u64 i_size_test;
+        struct rb_node *node;
+        struct btrfs_ordered_extent *test;
+        mutex_lock(&tree->mutex);
+        disk_i_size = BTRFS_I(inode)->disk_i_size;
+        /*
+         * if the disk i_size is already at the inode->i_size, or
+         * this ordered extent is inside the disk i_size, we're done
+         */
+        if (disk_i_size >= i_size_read(inode) ||
+            ordered->file_offset + ordered->len <= disk_i_size) {
+                goto out;
+        }
+        /*
+         * we can't update the disk_isize if there are delalloc bytes
+         * between disk_i_size and  this ordered extent
+         */
+        if (test_range_bit(io_tree, disk_i_size,
+                           ordered->file_offset + ordered->len - 1,
+                           EXTENT_DELALLOC, 0)) {
+                goto out;
+        }
+        /*
+         * walk backward from this ordered extent to disk_i_size.
+         * if we find an ordered extent then we can't update disk i_size
+         * yet
+         */
+        node = &ordered->rb_node;
+        while (1) {
+                node = rb_prev(node);
+                if (!node)
+                        break;
+                test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+                /* entirely below disk_i_size: nothing earlier can matter */
+                if (test->file_offset + test->len <= disk_i_size)
+                        break;
+                /* entirely beyond i_size: does not hold the update back */
+                if (test->file_offset >= i_size_read(inode))
+                        break;
+                if (test->file_offset >= disk_i_size)
+                        goto out;
+        }
+        new_i_size = min_t(u64, entry_end(ordered), i_size_read(inode));
+        /*
+         * at this point, we know we can safely update i_size to at least
+         * the offset from this ordered extent.  But, we need to
+         * walk forward and see if ios from higher up in the file have
+         * finished.
+         */
+        node = rb_next(&ordered->rb_node);
+        i_size_test = 0;
+        if (node) {
+                /*
+                 * do we have an area where IO might have finished
+                 * between our ordered extent and the next one.
+                 */
+                test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+                if (test->file_offset > entry_end(ordered))
+                        i_size_test = test->file_offset;
+        } else {
+                /* no later ordered extent: the region runs up to i_size */
+                i_size_test = i_size_read(inode);
+        }
+        /*
+         * i_size_test is the end of a region after this ordered
+         * extent where there are no ordered extents.  As long as there
+         * are no delalloc bytes in this area, it is safe to update
+         * disk_i_size to the end of the region.
+         */
+        if (i_size_test > entry_end(ordered) &&
+            !test_range_bit(io_tree, entry_end(ordered), i_size_test - 1,
+                           EXTENT_DELALLOC, 0)) {
+                new_i_size = min_t(u64, i_size_test, i_size_read(inode));
+        }
+        BTRFS_I(inode)->disk_i_size = new_i_size;
+out:
+        mutex_unlock(&tree->mutex);
+        return 0;
+}
+/*
+ * Try to find the checksum for disk block 'disk_bytenr' among the sums
+ * attached to the ordered extent covering file offset 'offset'.  This
+ * exists because pages may be reclaimed before their checksum has been
+ * put into the btree.
+ *
+ * On success the checksum is stored in *sum and 0 is returned.  1 is
+ * returned when there is no ordered extent at 'offset' or none of its
+ * sums matches.
+ */
+int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
+                           u32 *sum)
+{
+        struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
+        u32 sectorsize = BTRFS_I(inode)->root->sectorsize;
+        struct btrfs_ordered_extent *ordered;
+        struct btrfs_ordered_sum *group;
+        struct list_head *pos;
+        unsigned long nr;
+        unsigned long idx;
+        int ret = 1;
+        ordered = btrfs_lookup_ordered_extent(inode, offset);
+        if (!ordered)
+                return ret;
+        mutex_lock(&tree->mutex);
+        list_for_each_prev(pos, &ordered->list) {
+                group = list_entry(pos, struct btrfs_ordered_sum, list);
+                /* a group starting past the wanted block cannot hold it */
+                if (disk_bytenr < group->bytenr)
+                        continue;
+                nr = group->len / sectorsize;
+                for (idx = 0; idx < nr; idx++) {
+                        if (group->sums[idx].bytenr != disk_bytenr)
+                                continue;
+                        *sum = group->sums[idx].sum;
+                        ret = 0;
+                        goto out;
+                }
+        }
+out:
+        mutex_unlock(&tree->mutex);
+        /* drop the reference taken by the lookup above */
+        btrfs_put_ordered_extent(ordered);
+        return ret;
+}
+/**
+ * btrfs_fdatawrite_range - start writeback on mapping dirty pages in range
+ * @mapping:    address space structure to write
+ * @start:      offset in bytes where the range starts
+ * @end:        offset in bytes where the range ends (inclusive)
+ * @sync_mode:  enable synchronous operation
+ *
+ * Local copy of __filemap_fdatawrite_range() from mm/filemap.c, which
+ * isn't exported; this version hands the request to btrfs_writepages().
+ *
+ * Starts writeback against all of the mapping's dirty pages that lie
+ * within the byte offsets <start, end> inclusive.
+ *
+ * With sync_mode WB_SYNC_ALL this is a "data integrity" operation rather
+ * than a regular memory cleansing writeback: a dirty page/buffer that is
+ * encountered must be waited upon, not just skipped over.
+ *
+ * Returns whatever btrfs_writepages() returns.
+ */
+int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
+                           loff_t end, int sync_mode)
+{
+        struct writeback_control wbc = {
+                .range_start = start,
+                .range_end = end,
+                .sync_mode = sync_mode,
+                .for_writepages = 1,
+                /* write budget: twice the pages currently in the mapping */
+                .nr_to_write = mapping->nrpages * 2,
+        };
+        int ret = btrfs_writepages(mapping, &wbc);
+        return ret;
+}
+/**
+ * taken from mm/filemap.c because it isn't exported
+ *
+ * btrfs_wait_on_page_writeback_range - wait for writeback to complete
+ * @mapping:    target address_space
+ * @start:      beginning page index
+ * @end:        ending page index
+ *
+ * Wait for writeback to complete against pages indexed by start->end
+ * inclusive
+ *
+ * Returns 0 when end < start or when no error was seen.  Returns -EIO if
+ * a waited page has PageError set or AS_EIO is set on the mapping, and
+ * -ENOSPC if only AS_ENOSPC is set.  Both mapping flags are cleared.
+ */
+int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
+                                       pgoff_t start, pgoff_t end)
+{
+        struct pagevec pvec;
+        int nr_pages;
+        int ret = 0;
+        pgoff_t index;
+        if (end < start)
+                return 0;
+        pagevec_init(&pvec, 0);
+        index = start;
+        /*
+         * look up pages tagged for writeback in batches of at most
+         * PAGEVEC_SIZE, never asking for more than remain up to 'end'
+         */
+        while ((index <= end) &&
+                        (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+                        PAGECACHE_TAG_WRITEBACK,
+                        min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
+                unsigned i;
+                for (i = 0; i < nr_pages; i++) {
+                        struct page *page = pvec.pages[i];
+                        /* until radix tree lookup accepts end_index */
+                        if (page->index > end)
+                                continue;
+                        wait_on_page_writeback(page);
+                        if (PageError(page))
+                                ret = -EIO;
+                }
+                pagevec_release(&pvec);
+                cond_resched();
+        }
+        /* Check for outstanding write errors */
+        if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
+                ret = -ENOSPC;
+        /* checked last, so -EIO wins when both flags were set */
+        if (test_and_clear_bit(AS_EIO, &mapping->flags))
+                ret = -EIO;
+        return ret;
+}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
new file mode 100644
index 000000000000..ab66d5e8d6d6
--- /dev/null
+++ b/fs/btrfs/ordered-data.h
@@ -0,0 +1,158 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#ifndef __BTRFS_ORDERED_DATA__
+#define __BTRFS_ORDERED_DATA__
+/* one of these per inode */
+struct btrfs_ordered_inode_tree {
+        /* held around lookups in, and removals from, 'tree' */
+        struct mutex mutex;
+        /* rbtree of struct btrfs_ordered_extent, linked via their rb_node */
+        struct rb_root tree;
+        /*
+         * NOTE(review): looks like a cached search position; it is reset
+         * to NULL when an extent is removed - confirm against tree_search
+         */
+        struct rb_node *last;
+};
+/*
+ * these are used to collect checksums done just before bios submission.
+ * They are attached via a list into the ordered extent, and
+ * checksum items are inserted into the tree after all the blocks in
+ * the ordered extent are on disk
+ *
+ * A btrfs_sector_sum pairs one disk byte number with its checksum.
+ */
+struct btrfs_sector_sum {
+        /* bytenr on disk */
+        u64 bytenr;
+        /* checksum value recorded for that bytenr */
+        u32 sum;
+};
+/*
+ * A group of per-sector checksums, allocated with a variable-length
+ * 'sums' array at the end and chained on an ordered extent's 'list'.
+ */
+struct btrfs_ordered_sum {
+        /* bytenr is the start of this extent on disk */
+        u64 bytenr;
+        /*
+         * this is the length in bytes covered by the sums array below.
+         */
+        unsigned long len;
+        /* links this group into btrfs_ordered_extent.list */
+        struct list_head list;
+        /* last field is a variable length array of btrfs_sector_sums */
+        struct btrfs_sector_sum sums[];
+};
+/*
+ * bits for the flags field (bit numbers, used with set_bit/test_bit):
+ *
+ * BTRFS_ORDERED_IO_DONE is set when all of the blocks are written.
+ * It is used to make sure metadata is inserted into the tree only once
+ * per extent.
+ *
+ * BTRFS_ORDERED_COMPLETE is set when the extent is removed from the
+ * rbtree, just before waking any waiters.  It is used to indicate the
+ * IO is done and any metadata is inserted into the tree.
+ *
+ * BTRFS_ORDERED_NOCOW is set when the data is to be written in place.
+ *
+ * BTRFS_ORDERED_COMPRESSED is set when a compressed extent is written.
+ *
+ * BTRFS_ORDERED_PREALLOC is set when writing to a preallocated extent.
+ */
+#define BTRFS_ORDERED_IO_DONE 0 /* set when all the pages are written */
+#define BTRFS_ORDERED_COMPLETE 1 /* set when removed from the tree */
+#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */
+#define BTRFS_ORDERED_COMPRESSED 3 /* writing a compressed extent */
+#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */
+/*
+ * One pending ordered extent.  It sits in its inode's ordered rbtree
+ * (rb_node) and on a list of pending extents (root_extent_list), and is
+ * reference counted through 'refs'.
+ */
+struct btrfs_ordered_extent {
+        /* logical offset in the file */
+        u64 file_offset;
+        /* disk byte number */
+        u64 start;
+        /* ram length of the extent in bytes */
+        u64 len;
+        /* extent length on disk */
+        u64 disk_len;
+        /* flags (described above) */
+        unsigned long flags;
+        /* reference count; dropped with btrfs_put_ordered_extent() */
+        atomic_t refs;
+        /* the inode we belong to */
+        struct inode *inode;
+        /* list of checksums for insertion when the extent io is done */
+        struct list_head list;
+        /* used to wait for the BTRFS_ORDERED_COMPLETE bit */
+        wait_queue_head_t wait;
+        /* our friendly rbtree entry */
+        struct rb_node rb_node;
+        /* a per root list of all the pending ordered extents */
+        struct list_head root_extent_list;
+};
+/*
+ * Number of bytes to allocate for a struct btrfs_ordered_sum whose sums
+ * array covers 'bytes' of file data: the header plus one
+ * btrfs_sector_sum per sector (rounded up), plus one spare entry.
+ */
+static inline int btrfs_ordered_sum_size(struct btrfs_root *root,
+                                         unsigned long bytes)
+{
+        unsigned long sectors;
+        /* round up to whole sectors, then add the one extra slot */
+        sectors = (bytes + root->sectorsize - 1) / root->sectorsize + 1;
+        return sizeof(struct btrfs_ordered_sum) +
+                sectors * sizeof(struct btrfs_sector_sum);
+}
+/*
+ * Put a per-inode ordered tree into its empty state: no rbtree nodes,
+ * no 'last' pointer, and a freshly initialised mutex.
+ */
+static inline void
+btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
+{
+        t->last = NULL;
+        t->tree.rb_node = NULL;
+        mutex_init(&t->mutex);
+}
+/* ordered extent API */
+int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
+int btrfs_remove_ordered_extent(struct inode *inode,
+                                struct btrfs_ordered_extent *entry);
+int btrfs_dec_test_ordered_pending(struct inode *inode,
+                                       u64 file_offset, u64 io_size);
+int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
+                             u64 start, u64 len, u64 disk_len, int type);
+int btrfs_add_ordered_sum(struct inode *inode,
+                          struct btrfs_ordered_extent *entry,
+                          struct btrfs_ordered_sum *sum);
+struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
+                                                         u64 file_offset);
+void btrfs_start_ordered_extent(struct inode *inode,
+                                struct btrfs_ordered_extent *entry, int wait);
+int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
+struct btrfs_ordered_extent *
+btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset);
+int btrfs_ordered_update_i_size(struct inode *inode,
+                                struct btrfs_ordered_extent *ordered);
+int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
+                           u32 *sum);
+/* local copies of unexported mm/filemap.c helpers */
+int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
+                                       pgoff_t start, pgoff_t end);
+int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
+                           loff_t end, int sync_mode);
+int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only);
+#endif
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
new file mode 100644
index 000000000000..3c0d52af4f80
--- /dev/null
+++ b/fs/btrfs/orphan.c
@@ -0,0 +1,67 @@
+/*
+ * Copyright (C) 2008 Red Hat.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#include "ctree.h"
+#include "disk-io.h"
+/*
+ * Insert an empty item with key
+ * (BTRFS_ORPHAN_OBJECTID, BTRFS_ORPHAN_ITEM_KEY, offset) into 'root'.
+ *
+ * Returns -ENOMEM if no path can be allocated, otherwise the result of
+ * btrfs_insert_empty_item().
+ */
+int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
+                             struct btrfs_root *root, u64 offset)
+{
+        struct btrfs_key orphan_key;
+        struct btrfs_path *path;
+        int err;
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+        orphan_key.objectid = BTRFS_ORPHAN_OBJECTID;
+        btrfs_set_key_type(&orphan_key, BTRFS_ORPHAN_ITEM_KEY);
+        orphan_key.offset = offset;
+        /* the item carries no payload, hence a data size of 0 */
+        err = btrfs_insert_empty_item(trans, root, path, &orphan_key, 0);
+        btrfs_free_path(path);
+        return err;
+}
+/*
+ * Delete the item with key
+ * (BTRFS_ORPHAN_OBJECTID, BTRFS_ORPHAN_ITEM_KEY, offset) from 'root'.
+ *
+ * Returns 0 on success, -ENOMEM if no path can be allocated, -ENOENT if
+ * the search does not find the key, or the negative error returned by
+ * the search or the delete.
+ */
+int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
+                          struct btrfs_root *root, u64 offset)
+{
+        struct btrfs_path *path;
+        struct btrfs_key key;
+        int ret = 0;
+        key.objectid = BTRFS_ORPHAN_OBJECTID;
+        btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
+        key.offset = offset;
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+        if (ret < 0)
+                goto out;
+        if (ret) {
+                /*
+                 * A positive result means the key was not found.  Report
+                 * that as a real error code instead of leaking the raw
+                 * search result to callers that expect 0 or -errno.
+                 */
+                ret = -ENOENT;
+                goto out;
+        }
+        ret = btrfs_del_item(trans, root, path);
+out:
+        btrfs_free_path(path);
+        return ret;
+}
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
new file mode 100644
index 000000000000..5f8f218c1005
--- /dev/null
+++ b/fs/btrfs/print-tree.c
@@ -0,0 +1,216 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#include "ctree.h"
+#include "disk-io.h"
+#include "print-tree.h"
+/*
+ * Log a chunk item: one summary line (length, owner, type, stripe
+ * count) followed by one line per stripe with its devid and offset.
+ */
+static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk)
+{
+        int stripe_count = btrfs_chunk_num_stripes(eb, chunk);
+        int idx = 0;
+        printk(KERN_INFO "\t\tchunk length %llu owner %llu type %llu "
+               "num_stripes %d\n",
+               (unsigned long long)btrfs_chunk_length(eb, chunk),
+               (unsigned long long)btrfs_chunk_owner(eb, chunk),
+               (unsigned long long)btrfs_chunk_type(eb, chunk),
+               stripe_count);
+        while (idx < stripe_count) {
+                printk(KERN_INFO "\t\t\tstripe %d devid %llu offset %llu\n",
+                       idx,
+                       (unsigned long long)btrfs_stripe_devid_nr(eb, chunk,
+                                                                 idx),
+                       (unsigned long long)btrfs_stripe_offset_nr(eb, chunk,
+                                                                  idx));
+                idx++;
+        }
+}
+/*
+ * Log a device item: its devid, total bytes and bytes used.
+ */
+static void print_dev_item(struct extent_buffer *eb,
+                           struct btrfs_dev_item *dev_item)
+{
+        unsigned long long devid = btrfs_device_id(eb, dev_item);
+        unsigned long long total = btrfs_device_total_bytes(eb, dev_item);
+        unsigned long long used = btrfs_device_bytes_used(eb, dev_item);
+        printk(KERN_INFO "\t\tdev item devid %llu "
+               "total_bytes %llu bytes used %llu\n",
+               devid, total, used);
+}
+/*
+ * Dump a leaf to the kernel log at KERN_INFO: a header line with the
+ * leaf's bytenr, item count and free space, then one line per item with
+ * its key, offset and size, followed by type-specific detail for the
+ * item types handled in the switch below.  Items of any other type get
+ * only the generic line.
+ */
+void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
+{
+        int i;
+        u32 nr = btrfs_header_nritems(l);
+        struct btrfs_item *item;
+        struct btrfs_extent_item *ei;
+        struct btrfs_root_item *ri;
+        struct btrfs_dir_item *di;
+        struct btrfs_inode_item *ii;
+        struct btrfs_block_group_item *bi;
+        struct btrfs_file_extent_item *fi;
+        struct btrfs_key key;
+        struct btrfs_key found_key;
+        struct btrfs_extent_ref *ref;
+        struct btrfs_dev_extent *dev_extent;
+        u32 type;
+        printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n",
+                (unsigned long long)btrfs_header_bytenr(l), nr,
+                btrfs_leaf_free_space(root, l));
+        for (i = 0 ; i < nr ; i++) {
+                item = btrfs_item_nr(l, i);
+                btrfs_item_key_to_cpu(l, &key, i);
+                type = btrfs_key_type(&key);
+                /* generic line, printed for every item */
+                printk(KERN_INFO "\titem %d key (%llu %x %llu) itemoff %d "
+                       "itemsize %d\n",
+                        i,
+                        (unsigned long long)key.objectid, type,
+                        (unsigned long long)key.offset,
+                        btrfs_item_offset(l, item), btrfs_item_size(l, item));
+                /* type-specific detail */
+                switch (type) {
+                case BTRFS_INODE_ITEM_KEY:
+                        ii = btrfs_item_ptr(l, i, struct btrfs_inode_item);
+                        printk(KERN_INFO "\t\tinode generation %llu size %llu "
+                               "mode %o\n",
+                               (unsigned long long)
+                               btrfs_inode_generation(l, ii),
+                              (unsigned long long)btrfs_inode_size(l, ii),
+                               btrfs_inode_mode(l, ii));
+                        break;
+                case BTRFS_DIR_ITEM_KEY:
+                        di = btrfs_item_ptr(l, i, struct btrfs_dir_item);
+                        btrfs_dir_item_key_to_cpu(l, di, &found_key);
+                        printk(KERN_INFO "\t\tdir oid %llu type %u\n",
+                                (unsigned long long)found_key.objectid,
+                                btrfs_dir_type(l, di));
+                        break;
+                case BTRFS_ROOT_ITEM_KEY:
+                        ri = btrfs_item_ptr(l, i, struct btrfs_root_item);
+                        printk(KERN_INFO "\t\troot data bytenr %llu refs %u\n",
+                                (unsigned long long)
+                                btrfs_disk_root_bytenr(l, ri),
+                                btrfs_disk_root_refs(l, ri));
+                        break;
+                case BTRFS_EXTENT_ITEM_KEY:
+                        ei = btrfs_item_ptr(l, i, struct btrfs_extent_item);
+                        printk(KERN_INFO "\t\textent data refs %u\n",
+                                btrfs_extent_refs(l, ei));
+                        break;
+                case BTRFS_EXTENT_REF_KEY:
+                        ref = btrfs_item_ptr(l, i, struct btrfs_extent_ref);
+                        printk(KERN_INFO "\t\textent back ref root %llu "
+                               "gen %llu owner %llu num_refs %lu\n",
+                               (unsigned long long)btrfs_ref_root(l, ref),
+                               (unsigned long long)btrfs_ref_generation(l, ref),
+                               (unsigned long long)btrfs_ref_objectid(l, ref),
+                               (unsigned long)btrfs_ref_num_refs(l, ref));
+                        break;
+                case BTRFS_EXTENT_DATA_KEY:
+                        fi = btrfs_item_ptr(l, i,
+                                            struct btrfs_file_extent_item);
+                        /* inline extents only get their inline length */
+                        if (btrfs_file_extent_type(l, fi) ==
+                            BTRFS_FILE_EXTENT_INLINE) {
+                                printk(KERN_INFO "\t\tinline extent data "
+                                       "size %u\n",
+                                       btrfs_file_extent_inline_len(l, fi));
+                                break;
+                        }
+                        printk(KERN_INFO "\t\textent data disk bytenr %llu "
+                               "nr %llu\n",
+                               (unsigned long long)
+                               btrfs_file_extent_disk_bytenr(l, fi),
+                               (unsigned long long)
+                               btrfs_file_extent_disk_num_bytes(l, fi));
+                        printk(KERN_INFO "\t\textent data offset %llu "
+                               "nr %llu ram %llu\n",
+                               (unsigned long long)
+                               btrfs_file_extent_offset(l, fi),
+                               (unsigned long long)
+                               btrfs_file_extent_num_bytes(l, fi),
+                               (unsigned long long)
+                               btrfs_file_extent_ram_bytes(l, fi));
+                        break;
+                case BTRFS_BLOCK_GROUP_ITEM_KEY:
+                        bi = btrfs_item_ptr(l, i,
+                                            struct btrfs_block_group_item);
+                        printk(KERN_INFO "\t\tblock group used %llu\n",
+                               (unsigned long long)
+                               btrfs_disk_block_group_used(l, bi));
+                        break;
+                case BTRFS_CHUNK_ITEM_KEY:
+                        print_chunk(l, btrfs_item_ptr(l, i,
+                                                      struct btrfs_chunk));
+                        break;
+                case BTRFS_DEV_ITEM_KEY:
+                        print_dev_item(l, btrfs_item_ptr(l, i,
+                                        struct btrfs_dev_item));
+                        break;
+                case BTRFS_DEV_EXTENT_KEY:
+                        dev_extent = btrfs_item_ptr(l, i,
+                                                    struct btrfs_dev_extent);
+                        printk(KERN_INFO "\t\tdev extent chunk_tree %llu\n"
+                               "\t\tchunk objectid %llu chunk offset %llu "
+                               "length %llu\n",
+                               (unsigned long long)
+                               btrfs_dev_extent_chunk_tree(l, dev_extent),
+                               (unsigned long long)
+                               btrfs_dev_extent_chunk_objectid(l, dev_extent),
+                               (unsigned long long)
+                               btrfs_dev_extent_chunk_offset(l, dev_extent),
+                               (unsigned long long)
+                               btrfs_dev_extent_length(l, dev_extent));
+                };
+        }
+}
+/*
+ * Recursively dump the tree rooted at 'c' to the kernel log.
+ *
+ * A NULL 'c' is ignored.  A level 0 block is printed with
+ * btrfs_print_leaf().  For an interior node, a header line and every
+ * key/block pointer are printed first, then each child block is read
+ * and printed in turn.  Children that cannot be read are skipped.
+ * A child whose level is not exactly one below its parent triggers
+ * BUG().
+ */
+void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
+{
+        int i; u32 nr;
+        struct btrfs_key key;
+        int level;
+        if (!c)
+                return;
+        nr = btrfs_header_nritems(c);
+        level = btrfs_header_level(c);
+        if (level == 0) {
+                btrfs_print_leaf(root, c);
+                return;
+        }
+        printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n",
+               (unsigned long long)btrfs_header_bytenr(c),
+               btrfs_header_level(c), nr,
+               (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr);
+        for (i = 0; i < nr; i++) {
+                btrfs_node_key_to_cpu(c, &key, i);
+                printk(KERN_INFO "\tkey %d (%llu %u %llu) block %llu\n",
+                       i,
+                       (unsigned long long)key.objectid,
+                       key.type,
+                       (unsigned long long)key.offset,
+                       (unsigned long long)btrfs_node_blockptr(c, i));
+        }
+        for (i = 0; i < nr; i++) {
+                struct extent_buffer *next = read_tree_block(root,
+                                        btrfs_node_blockptr(c, i),
+                                        btrfs_level_size(root, level - 1),
+                                        btrfs_node_ptr_generation(c, i));
+                /*
+                 * the level checks below dereference 'next', so a child
+                 * that could not be read must be skipped first
+                 */
+                if (!next)
+                        continue;
+                if (btrfs_is_leaf(next) &&
+                    btrfs_header_level(c) != 1)
+                        BUG();
+                if (btrfs_header_level(next) !=
+                        btrfs_header_level(c) - 1)
+                        BUG();
+                btrfs_print_tree(root, next);
+                free_extent_buffer(next);
+        }
+}
diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h
new file mode 100644
index 000000000000..da75efe534d5
--- /dev/null
+++ b/fs/btrfs/print-tree.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#ifndef __PRINT_TREE_
+#define __PRINT_TREE_
+void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l);
+void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *t);
+#endif /* __PRINT_TREE_ */
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
new file mode 100644
index 000000000000..6f0acc4c9eab
--- /dev/null
+++ b/fs/btrfs/ref-cache.c
@@ -0,0 +1,230 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#include <linux/sched.h>
+#include "ctree.h"
+#include "ref-cache.h"
+#include "transaction.h"
+/*
+ * leaf refs are used to cache the information about which extents
+ * a given leaf has references on.  This allows us to process that leaf
+ * in btrfs_drop_snapshot without needing to read it back from disk.
+ */
+/*
+ * kmalloc a leaf reference struct with room for nr_extents extent records
+ * and add its size to the total ref cache size.  Returns NULL on failure.
+ */
+struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root,
+                                            int nr_extents)
+{
+        size_t size = btrfs_leaf_ref_size(nr_extents);
+        struct btrfs_leaf_ref *ref = kmalloc(size, GFP_NOFS);
+        if (!ref)
+                return NULL;
+        spin_lock(&root->fs_info->ref_cache_lock);
+        root->fs_info->total_ref_cache_size += size;
+        spin_unlock(&root->fs_info->ref_cache_lock);
+        /* only the fixed header is zeroed; extents[] is not initialised */
+        memset(ref, 0, sizeof(*ref));
+        atomic_set(&ref->usage, 1);
+        INIT_LIST_HEAD(&ref->list);
+        return ref;
+}
+/*
+ * drop one usage count; the final put kfrees the ref and uncharges its size
+ */
+void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
+{
+        size_t size;
+        if (!ref)
+                return;
+        WARN_ON(atomic_read(&ref->usage) == 0);
+        if (!atomic_dec_and_test(&ref->usage))
+                return;
+        size = btrfs_leaf_ref_size(ref->nritems);
+        BUG_ON(ref->in_tree);
+        kfree(ref);
+        spin_lock(&root->fs_info->ref_cache_lock);
+        root->fs_info->total_ref_cache_size -= size;
+        spin_unlock(&root->fs_info->ref_cache_lock);
+}
+/* link 'node' in by bytenr; returns NULL, or the node already using it */
+static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
+                                   struct rb_node *node)
+{
+        struct rb_node **p = &root->rb_node;
+        struct rb_node *parent = NULL;
+        struct btrfs_leaf_ref *entry;
+        while (*p) {
+                parent = *p;
+                entry = rb_entry(parent, struct btrfs_leaf_ref, rb_node);
+                if (bytenr < entry->bytenr)
+                        p = &(*p)->rb_left;
+                else if (bytenr > entry->bytenr)
+                        p = &(*p)->rb_right;
+                else
+                        return parent;
+        }
+        rb_link_node(node, parent, p);
+        rb_insert_color(node, root);
+        return NULL;
+}
+/*
+ * find the rb node whose leaf ref has exactly this bytenr; NULL when absent
+ */
+static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
+{
+        struct rb_node *cur = root->rb_node;
+        while (cur) {
+                struct btrfs_leaf_ref *ref;
+                ref = rb_entry(cur, struct btrfs_leaf_ref, rb_node);
+                WARN_ON(!ref->in_tree);
+                if (bytenr == ref->bytenr)
+                        return cur;
+                cur = bytenr < ref->bytenr ? cur->rb_left : cur->rb_right;
+        }
+        return NULL;
+}
+int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
+                           int shared)
+{
+        struct btrfs_leaf_ref *ref = NULL;
+        struct btrfs_leaf_ref_tree *tree = root->ref_tree;
+        if (shared)     /* work on the fs-wide shared tree instead */
+                tree = &root->fs_info->shared_ref_tree;
+        if (!tree)      /* nothing to prune for this root */
+                return 0;
+        spin_lock(&tree->lock);
+        while (!list_empty(&tree->list)) {      /* consume from the list head */
+                ref = list_entry(tree->list.next, struct btrfs_leaf_ref, list);
+                BUG_ON(ref->tree != tree);
+                if (ref->root_gen > max_root_gen)
+                        break;          /* stop at the first newer ref */
+                if (!xchg(&ref->in_tree, 0)) {  /* flag was already clear */
+                        cond_resched_lock(&tree->lock);
+                        continue;       /* look at the list head again */
+                }
+                rb_erase(&ref->rb_node, &tree->root);
+                list_del_init(&ref->list);
+                spin_unlock(&tree->lock);       /* put the ref unlocked */
+                btrfs_free_leaf_ref(root, ref);
+                cond_resched();
+                spin_lock(&tree->lock);
+        }
+        spin_unlock(&tree->lock);
+        return 0;       /* no failure is ever reported */
+}
+/*
+ * find the leaf ref for a given extent in the root's tree, then the shared
+ * tree.  A hit is returned with a usage reference incremented, else NULL
+ */
+struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
+                                             u64 bytenr)
+{
+        struct btrfs_leaf_ref_tree *tree = root->ref_tree;
+        struct btrfs_leaf_ref *found = NULL;
+        struct rb_node *node;
+        for (;;) {
+                if (tree) {
+                        spin_lock(&tree->lock);
+                        node = tree_search(&tree->root, bytenr);
+                        if (node) {
+                                found = rb_entry(node, struct btrfs_leaf_ref,
+                                                 rb_node);
+                                atomic_inc(&found->usage);
+                        }
+                        spin_unlock(&tree->lock);
+                        if (found)
+                                return found;
+                }
+                if (tree == &root->fs_info->shared_ref_tree)
+                        return NULL;
+                tree = &root->fs_info->shared_ref_tree;
+        }
+}
+/*
+ * add a fully filled in leaf ref struct to the root's ref tree, or to the
+ * shared tree when 'shared' is set.  On success the tree holds its own usage
+ * count on the ref.  Returns -EEXIST when the tree already has a ref with
+ * the same bytenr.
+ */
+int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
+                       int shared)
+{
+        struct btrfs_leaf_ref_tree *tree = root->ref_tree;
+        if (shared)
+                tree = &root->fs_info->shared_ref_tree;
+        spin_lock(&tree->lock);
+        if (tree_insert(&tree->root, ref->bytenr, &ref->rb_node)) {
+                spin_unlock(&tree->lock);
+                return -EEXIST;
+        }
+        /* linked: record the membership and queue the ref at the list tail */
+        atomic_inc(&ref->usage);
+        ref->tree = tree;
+        ref->in_tree = 1;
+        list_add_tail(&ref->list, &tree->list);
+        spin_unlock(&tree->lock);
+        return 0;
+}
+/*
+ * remove a single leaf ref from the tree.  This drops the ref held by the tree
+ * only, and does nothing if in_tree was already clear
+ */
+int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
+{
+        struct btrfs_leaf_ref_tree *owner;
+        if (xchg(&ref->in_tree, 0)) {
+                owner = ref->tree;
+                spin_lock(&owner->lock);
+                rb_erase(&ref->rb_node, &owner->root);
+                list_del_init(&ref->list);
+                spin_unlock(&owner->lock);
+                btrfs_free_leaf_ref(root, ref);
+        }
+        return 0;
+}
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h
new file mode 100644
index 000000000000..16f3183d7c59
--- /dev/null
+++ b/fs/btrfs/ref-cache.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#ifndef __REFCACHE__
+#define __REFCACHE__
+struct btrfs_extent_info {
+        /* bytenr and num_bytes find the extent in the extent allocation tree */
+        u64 bytenr;
+        u64 num_bytes;
+        /* objectid and offset find the back reference for the file */
+        u64 objectid;
+        u64 offset;
+};
+struct btrfs_leaf_ref {
+        struct rb_node rb_node;         /* link in the tree's rbtree */
+        struct btrfs_leaf_ref_tree *tree;       /* tree this ref was added to */
+        int in_tree;                    /* nonzero while linked into 'tree' */
+        atomic_t usage;                 /* usage count; freed on the last put */
+        u64 root_gen;                   /* compared against max_root_gen */
+        u64 bytenr;                     /* rbtree search key */
+        u64 owner;
+        u64 generation;
+        int nritems;                    /* extent count used to size the ref */
+        struct list_head list;          /* link in the tree's list */
+        struct btrfs_extent_info extents[];
+};
+static inline size_t btrfs_leaf_ref_size(int nr_extents)
+{
+        size_t tail = sizeof(struct btrfs_extent_info) * nr_extents;
+        return sizeof(struct btrfs_leaf_ref) + tail;   /* header + extents[] */
+}
+static inline void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree)
+{
+        spin_lock_init(&tree->lock);    /* lock first, then the containers */
+        INIT_LIST_HEAD(&tree->list);
+        tree->root.rb_node = NULL;      /* empty rbtree */
+}
+static inline int btrfs_leaf_ref_tree_empty(struct btrfs_leaf_ref_tree *tree)
+{
+        return RB_EMPTY_ROOT(&tree->root);      /* only the rbtree is checked */
+}
+/* the functions below are implemented in ref-cache.c */
+struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root,
+                                            int nr_extents);
+void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
+struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
+                                             u64 bytenr);
+int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
+                       int shared);
+int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
+                           int shared);
+int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
+#endif
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
new file mode 100644
index 000000000000..b48650de4472
--- /dev/null
+++ b/fs/btrfs/root-tree.c
@@ -0,0 +1,366 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#include "ctree.h"
+#include "transaction.h"
+#include "disk-io.h"
+#include "print-tree.h"
+/*
+ *  search the tree root forward for a root item, starting from the key
+ *  (search_start, (u8)-1, (u64)-1).  If a root key is found, the objectid we
+ *  find is filled into 'found_objectid' and 0 is returned.  < 0 is returned
+ *  on error, 1 if there is nothing left in the tree.
+ */
+int btrfs_search_root(struct btrfs_root *root, u64 search_start,
+                      u64 *found_objectid)
+{
+        struct btrfs_path *path;
+        struct btrfs_key search_key;
+        int ret;
+        root = root->fs_info->tree_root;        /* always search the tree root */
+        search_key.objectid = search_start;
+        search_key.type = (u8)-1;
+        search_key.offset = (u64)-1;
+        path = btrfs_alloc_path();
+        BUG_ON(!path);
+again:
+        ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+        if (ret < 0)
+                goto out;
+        if (ret == 0) {         /* an exact key match is reported as 1 too */
+                ret = 1;
+                goto out;
+        }
+        if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+                ret = btrfs_next_leaf(root, path);
+                if (ret)        /* error, or no further leaf */
+                        goto out;
+        }
+        btrfs_item_key_to_cpu(path->nodes[0], &search_key, path->slots[0]);
+        if (search_key.type != BTRFS_ROOT_ITEM_KEY) {
+                search_key.offset++;    /* retry just past this key */
+                btrfs_release_path(root, path);
+                goto again;
+        }
+        ret = 0;
+        *found_objectid = search_key.objectid;
+out:
+        btrfs_free_path(path);
+        return ret;
+}
+/*
+ * lookup the root with the highest offset for a given objectid.  The item is
+ * read into 'item' and the key we do find is copied into 'key'.  If we find
+ * something return 0, otherwise 1, < 0 on error.
+ */
+int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
+                        struct btrfs_root_item *item, struct btrfs_key *key)
+{
+        struct btrfs_key search_key;
+        struct btrfs_key found_key;
+        struct extent_buffer *leaf;
+        struct btrfs_path *path;
+        int slot;
+        int ret;
+        search_key.objectid = objectid;
+        search_key.type = BTRFS_ROOT_ITEM_KEY;
+        search_key.offset = (u64)-1;
+        path = btrfs_alloc_path();
+        BUG_ON(!path);
+        ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+        if (ret < 0)
+                goto out;
+        BUG_ON(ret == 0);
+        leaf = path->nodes[0];
+        BUG_ON(path->slots[0] == 0);
+        /* the candidate is the item just before the slot the search gave us */
+        slot = path->slots[0] - 1;
+        btrfs_item_key_to_cpu(leaf, &found_key, slot);
+        ret = 1;
+        if (found_key.objectid != objectid)
+                goto out;
+        read_extent_buffer(leaf, item, btrfs_item_ptr_offset(leaf, slot),
+                           sizeof(*item));
+        *key = found_key;
+        ret = 0;
+out:
+        btrfs_free_path(path);
+        return ret;
+}
+/*
+ * overwrite the root item stored at 'key' with the data in 'item'.  The key
+ * must already exist in the tree; a missing key is treated as a bug.
+ */
+int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
+                      *root, struct btrfs_key *key, struct btrfs_root_item
+                      *item)
+{
+        struct btrfs_path *path;
+        struct extent_buffer *leaf;
+        unsigned long item_off;
+        int ret;
+        path = btrfs_alloc_path();
+        BUG_ON(!path);
+        ret = btrfs_search_slot(trans, root, key, path, 0, 1);
+        if (ret < 0)
+                goto out;
+        leaf = path->nodes[0];
+        if (ret != 0) {
+                /* dump the leaf we landed in before giving up */
+                btrfs_print_leaf(root, leaf);
+                printk(KERN_CRIT "unable to update root key %llu %u %llu\n",
+                       (unsigned long long)key->objectid, key->type,
+                       (unsigned long long)key->offset);
+                BUG_ON(1);
+        }
+        item_off = btrfs_item_ptr_offset(leaf, path->slots[0]);
+        write_extent_buffer(leaf, item, item_off, sizeof(*item));
+        btrfs_mark_buffer_dirty(leaf);
+out:
+        btrfs_release_path(root, path);
+        btrfs_free_path(path);
+        return ret;
+}
+/* insert 'item' into 'root' under 'key' as a whole struct btrfs_root_item */
+int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
+                      *root, struct btrfs_key *key, struct btrfs_root_item
+                      *item)
+{
+        /* nothing to add here: the btrfs_insert_item() result is returned */
+        return btrfs_insert_item(trans, root, key, item, sizeof(*item));
+}
+/*
+ * at mount time we want to find all the old transaction snapshots that were in
+ * the process of being deleted if we crashed.  Each root item of 'objectid'
+ * with a zero on-disk refs count is read in and queued for deletion to finish
+ * what was happening when we crashed.  Returns 0 on success, else an error.
+ */
+int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
+                          struct btrfs_root *latest)
+{
+        struct btrfs_root *dead_root;
+        struct btrfs_root_item *ri;
+        struct btrfs_key key;
+        struct btrfs_key found_key;
+        struct btrfs_path *path;
+        int ret;
+        u32 nritems;
+        struct extent_buffer *leaf;
+        int slot;
+        key.objectid = objectid;
+        btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+        key.offset = 0;
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+again:
+        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+        if (ret < 0)
+                goto err;
+        while (1) {
+                leaf = path->nodes[0];
+                nritems = btrfs_header_nritems(leaf);
+                slot = path->slots[0];
+                if (slot >= nritems) {
+                        ret = btrfs_next_leaf(root, path);
+                        /* a failed read must not be reported as success */
+                        if (ret < 0)
+                                goto err;
+                        if (ret)
+                                break;
+                        leaf = path->nodes[0];
+                        nritems = btrfs_header_nritems(leaf);
+                        slot = path->slots[0];
+                }
+                btrfs_item_key_to_cpu(leaf, &key, slot);
+                if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY)
+                        goto next;
+                if (key.objectid < objectid)
+                        goto next;
+                if (key.objectid > objectid)
+                        break;
+                ri = btrfs_item_ptr(leaf, slot, struct btrfs_root_item);
+                if (btrfs_disk_root_refs(leaf, ri) != 0)
+                        goto next;
+                memcpy(&found_key, &key, sizeof(key));
+                key.offset++;
+                btrfs_release_path(root, path);
+                dead_root =
+                        btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
+                                                    &found_key);
+                if (IS_ERR(dead_root)) {
+                        ret = PTR_ERR(dead_root);
+                        goto err;
+                }
+                if (objectid == BTRFS_TREE_RELOC_OBJECTID)
+                        ret = btrfs_add_dead_reloc_root(dead_root);
+                else
+                        ret = btrfs_add_dead_root(dead_root, latest);
+                if (ret)
+                        goto err;
+                goto again;
+next:
+                path->slots[0]++;
+        }
+        ret = 0;
+err:
+        btrfs_free_path(path);
+        return ret;
+}
+/*
+ * drop the root item for 'key' from 'root'; its refs count must be zero
+ */
+int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+                   struct btrfs_key *key)
+{
+        struct btrfs_root_item *ri;
+        struct extent_buffer *leaf;
+        struct btrfs_path *path;
+        int ret;
+        path = btrfs_alloc_path();
+        BUG_ON(!path);
+        ret = btrfs_search_slot(trans, root, key, path, -1, 1);
+        if (ret < 0)
+                goto out;
+        BUG_ON(ret != 0);
+        leaf = path->nodes[0];
+        ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item);
+        BUG_ON(btrfs_disk_root_refs(leaf, ri) != 0);
+        ret = btrfs_del_item(trans, root, path);
+out:
+        btrfs_release_path(root, path);
+        btrfs_free_path(path);
+        return ret;
+}
+#if 0 /* this will get used when snapshot deletion is implemented */
+int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
+                       struct btrfs_root *tree_root,
+                       u64 root_id, u8 type, u64 ref_id)
+{
+        struct btrfs_key key;
+        int ret;
+        struct btrfs_path *path;
+        path = btrfs_alloc_path();      /* NOTE(review): never checked for NULL */
+        key.objectid = root_id;
+        key.type = type;
+        key.offset = ref_id;
+        ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
+        BUG_ON(ret);                    /* missing item or error is fatal */
+        ret = btrfs_del_item(trans, tree_root, path);
+        BUG_ON(ret);
+        btrfs_free_path(path);
+        return ret;                     /* 0 once both BUG_ONs have passed */
+}
+#endif
+/* search tree_root for the (root_id, BTRFS_ROOT_REF_KEY, ref_id) item */
+int btrfs_find_root_ref(struct btrfs_root *tree_root,
+                   struct btrfs_path *path,
+                   u64 root_id, u64 ref_id)
+{
+        struct btrfs_key key;
+        key.objectid = root_id;
+        key.type = BTRFS_ROOT_REF_KEY;
+        key.offset = ref_id;
+        /* the btrfs_search_slot() result is handed straight back */
+        return btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
+}
+/*
+ * add a btrfs_root_ref item.  type is either BTRFS_ROOT_REF_KEY
+ * or BTRFS_ROOT_BACKREF_KEY.
+ *
+ * The dirid, sequence, name and name_len refer to the directory entry
+ * that is referencing the root.
+ *
+ * For a forward ref, the root_id is the id of the tree referencing
+ * the root and ref_id is the id of the subvol  or snapshot.
+ *
+ * For a back ref the root_id is the id of the subvol or snapshot and
+ * ref_id is the id of the tree referencing it.
+ */
+int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
+                       struct btrfs_root *tree_root,
+                       u64 root_id, u8 type, u64 ref_id,
+                       u64 dirid, u64 sequence,
+                       const char *name, int name_len)
+{
+        struct btrfs_key key;
+        int ret;
+        struct btrfs_path *path;
+        struct btrfs_root_ref *ref;
+        struct extent_buffer *leaf;
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM; /* don't hand a NULL path to the insert */
+        key.objectid = root_id;
+        key.type = type;
+        key.offset = ref_id;
+        ret = btrfs_insert_empty_item(trans, tree_root, path, &key,
+                                      sizeof(*ref) + name_len);
+        BUG_ON(ret);
+        leaf = path->nodes[0];
+        ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
+        btrfs_set_root_ref_dirid(leaf, ref, dirid);
+        btrfs_set_root_ref_sequence(leaf, ref, sequence);
+        btrfs_set_root_ref_name_len(leaf, ref, name_len);
+        write_extent_buffer(leaf, name, (unsigned long)(ref + 1), name_len);
+        btrfs_mark_buffer_dirty(leaf);
+        btrfs_free_path(path);
+        return ret;
+}
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
new file mode 100644
index 000000000000..c0f7ecaf1e79
--- /dev/null
+++ b/fs/btrfs/struct-funcs.c
@@ -0,0 +1,139 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#include <linux/highmem.h>
+/* this is some deeply nasty code.  ctree.h has a different
+ * definition for this BTRFS_SETGET_FUNCS macro, behind a #ifndef
+ *
+ * The end result is that anyone who #includes ctree.h gets a
+ * declaration for the btrfs_set_foo functions and btrfs_foo functions
+ *
+ * This file declares the macros and then #includes ctree.h, which results
+ * in cpp creating the function here based on the template below.
+ *
+ * These setget functions do all the extent_buffer related mapping
+ * required to efficiently read and write specific fields in the extent
+ * buffers.  Every pointer to metadata items in btrfs is really just
+ * an unsigned long offset into the extent buffer which has been
+ * cast to a specific type.  This gives us all the gcc type checking.
+ *
+ * The extent buffer api is used to do all the kmapping and page
+ * spanning work required to get extent buffers in highmem and have
+ * a metadata blocksize different from the page size.
+ *
+ * The macro starts with a simple function prototype declaration so that
+ * sparse won't complain about it being static.
+ */
+#define BTRFS_SETGET_FUNCS(name, type, member, bits)                    \
+u##bits btrfs_##name(struct extent_buffer *eb, type *s);                \
+void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val);  \
+u##bits btrfs_##name(struct extent_buffer *eb,                          \
+                                   type *s)                             \
+{                                                                       \
+        unsigned long part_offset = (unsigned long)s;                   \
+        unsigned long offset = part_offset + offsetof(type, member);    \
+        type *p;                                                        \
+        /* fast path: the field lies inside the current mapping */     \
+        if (eb->map_token && offset >= eb->map_start &&                 \
+            offset + sizeof(((type *)0)->member) <= eb->map_start +     \
+            eb->map_len) {                                              \
+                p = (type *)(eb->kaddr + part_offset - eb->map_start);  \
+                return le##bits##_to_cpu(p->member);                    \
+        }                                                               \
+        { /* slow path: map just this member's range */                 \
+                int err;                                                \
+                char *map_token;                                        \
+                char *kaddr;                                            \
+                int unmap_on_exit = (eb->map_token == NULL);            \
+                unsigned long map_start;                                \
+                unsigned long map_len;                                  \
+                u##bits res;                                            \
+                err = map_extent_buffer(eb, offset,                     \
+                                sizeof(((type *)0)->member),            \
+                                &map_token, &kaddr,                     \
+                                &map_start, &map_len, KM_USER1);        \
+                if (err) { /* no mapping: use read_eb_member */         \
+                        __le##bits leres;                               \
+                        read_eb_member(eb, s, type, member, &leres);    \
+                        return le##bits##_to_cpu(leres);                \
+                }                                                       \
+                p = (type *)(kaddr + part_offset - map_start);          \
+                res = le##bits##_to_cpu(p->member);                     \
+                if (unmap_on_exit) /* eb had no mapping on entry */     \
+                        unmap_extent_buffer(eb, map_token, KM_USER1);   \
+                return res;                                             \
+        }                                                               \
+}                                                                       \
+void btrfs_set_##name(struct extent_buffer *eb,                         \
+                                    type *s, u##bits val)               \
+{                                                                       \
+        unsigned long part_offset = (unsigned long)s;                   \
+        unsigned long offset = part_offset + offsetof(type, member);    \
+        type *p;                                                        \
+        /* fast path: the field lies inside the current mapping */     \
+        if (eb->map_token && offset >= eb->map_start &&                 \
+            offset + sizeof(((type *)0)->member) <= eb->map_start +     \
+            eb->map_len) {                                              \
+                p = (type *)(eb->kaddr + part_offset - eb->map_start);  \
+                p->member = cpu_to_le##bits(val);                       \
+                return;                                                 \
+        }                                                               \
+        { /* slow path: map just this member's range */                 \
+                int err;                                                \
+                char *map_token;                                        \
+                char *kaddr;                                            \
+                int unmap_on_exit = (eb->map_token == NULL);            \
+                unsigned long map_start;                                \
+                unsigned long map_len;                                  \
+                err = map_extent_buffer(eb, offset,                     \
+                                sizeof(((type *)0)->member),            \
+                                &map_token, &kaddr,                     \
+                                &map_start, &map_len, KM_USER1);        \
+                if (err) { /* no mapping: use write_eb_member */        \
+                        __le##bits val2;                                \
+                        val2 = cpu_to_le##bits(val);                    \
+                        write_eb_member(eb, s, type, member, &val2);    \
+                        return;                                         \
+                }                                                       \
+                p = (type *)(kaddr + part_offset - map_start);          \
+                p->member = cpu_to_le##bits(val);                       \
+                if (unmap_on_exit) /* eb had no mapping on entry */     \
+                        unmap_extent_buffer(eb, map_token, KM_USER1);   \
+        }                                                               \
+}
+#include "ctree.h"
+void btrfs_node_key(struct extent_buffer *eb,
+                    struct btrfs_disk_key *disk_key, int nr)
+{
+        unsigned long ptr = btrfs_node_key_ptr_offset(nr); /* offset in eb */
+        if (eb->map_token && ptr >= eb->map_start &&
+            ptr + sizeof(*disk_key) <= eb->map_start + eb->map_len) {
+                memcpy(disk_key, eb->kaddr + ptr - eb->map_start,
+                        sizeof(*disk_key));
+                return;                 /* key copied from the live mapping */
+        } else if (eb->map_token) {     /* mapped, but not over this key */
+                unmap_extent_buffer(eb, eb->map_token, KM_USER1);
+                eb->map_token = NULL;
+        }
+        read_eb_member(eb, (struct btrfs_key_ptr *)ptr,
+                       struct btrfs_key_ptr, key, disk_key); /* generic copy */
+}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
new file mode 100644
index 000000000000..db9fb3bc1e33
--- /dev/null
+++ b/fs/btrfs/super.c
@@ -0,0 +1,723 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#include <linux/blkdev.h>
+#include <linux/module.h>
+#include <linux/buffer_head.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/backing-dev.h>
+#include <linux/mount.h>
+#include <linux/mpage.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/statfs.h>
+#include <linux/compat.h>
+#include <linux/parser.h>
+#include <linux/ctype.h>
+#include <linux/namei.h>
+#include <linux/miscdevice.h>
+#include <linux/version.h>
+#include <linux/magic.h>
+#include "compat.h"
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "ioctl.h"
+#include "print-tree.h"
+#include "xattr.h"
+#include "volumes.h"
+#include "version.h"
+#include "export.h"
+#include "compression.h"
+static struct super_operations btrfs_super_ops;
+/*
+ * ->put_super: close the tree attached to this super block and clear the
+ * s_fs_info pointer.  The result of close_ctree() is not acted upon.
+ */
+static void btrfs_put_super(struct super_block *sb)
+{
+        close_ctree(btrfs_sb(sb));
+        sb->s_fs_info = NULL;
+}
+/*
+ * Token ids for the mount options listed in the 'tokens' match table.
+ * Opt_err is the id attached to the table's terminating NULL entry.
+ */
+enum {
+        Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
+        Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
+        Opt_ssd, Opt_thread_pool, Opt_noacl,  Opt_compress, Opt_err,
+};
+/*
+ * Mount option patterns handed to match_token().  "%s" options carry a
+ * string argument and "%d" options an integer argument; both parsers in
+ * this file read the argument from args[0].
+ */
+static match_table_t tokens = {
+        {Opt_degraded, "degraded"},
+        {Opt_subvol, "subvol=%s"},
+        {Opt_device, "device=%s"},
+        {Opt_nodatasum, "nodatasum"},
+        {Opt_nodatacow, "nodatacow"},
+        {Opt_nobarrier, "nobarrier"},
+        {Opt_max_extent, "max_extent=%s"},
+        {Opt_max_inline, "max_inline=%s"},
+        {Opt_alloc_start, "alloc_start=%s"},
+        {Opt_thread_pool, "thread_pool=%d"},
+        {Opt_compress, "compress"},
+        {Opt_ssd, "ssd"},
+        {Opt_noacl, "noacl"},
+        {Opt_err, NULL},
+};
+/*
+ * Parse a size written as a decimal number with an optional one-letter
+ * suffix.  'k', 'm' and 'g' (either case) multiply the number by 2^10,
+ * 2^20 and 2^30; any other letter leaves the number unscaled.
+ *
+ * The number is parsed as a 64-bit value so that sizes of 4GiB and above
+ * are not truncated on kernels where unsigned long is 32 bits wide.
+ */
+u64 btrfs_parse_size(char *str)
+{
+        u64 res;
+        u64 mult = 1;
+        char *end;
+        char last;
+        res = simple_strtoull(str, &end, 10);
+        last = end[0];
+        if (isalpha(last)) {
+                last = tolower(last);
+                switch (last) {
+                case 'g':
+                        mult *= 1024;
+                        /* fall through */
+                case 'm':
+                        mult *= 1024;
+                        /* fall through */
+                case 'k':
+                        mult *= 1024;
+                        break;
+                default:
+                        break;
+                }
+                res = res * mult;
+        }
+        return res;
+}
+/*
+ * Regular mount options parser.  Everything that is needed only when
+ * reading in a new superblock is parsed here.
+ *
+ * subvol= and device= are handled by btrfs_parse_early_options and are
+ * skipped here, as are options that match no table entry.
+ *
+ * Returns 0 on success, or -ENOMEM if the option string cannot be
+ * duplicated.
+ */
+int btrfs_parse_options(struct btrfs_root *root, char *options)
+{
+        struct btrfs_fs_info *info = root->fs_info;
+        substring_t args[MAX_OPT_ARGS];
+        char *p, *num, *orig;
+        int intarg;
+        if (!options)
+                return 0;
+        /*
+         * strsep changes the string, duplicate it because parse_options
+         * gets called twice
+         */
+        options = kstrdup(options, GFP_NOFS);
+        if (!options)
+                return -ENOMEM;
+        /*
+         * strsep() advances 'options' and leaves it NULL after the last
+         * token, so keep the start of the copy for the final kfree().
+         */
+        orig = options;
+        while ((p = strsep(&options, ",")) != NULL) {
+                int token;
+                if (!*p)
+                        continue;
+                token = match_token(p, tokens, args);
+                switch (token) {
+                case Opt_degraded:
+                        printk(KERN_INFO "btrfs: allowing degraded mounts\n");
+                        btrfs_set_opt(info->mount_opt, DEGRADED);
+                        break;
+                case Opt_subvol:
+                case Opt_device:
+                        /*
+                         * These are parsed by btrfs_parse_early_options
+                         * and can be happily ignored here.
+                         */
+                        break;
+                case Opt_nodatasum:
+                        printk(KERN_INFO "btrfs: setting nodatacsum\n");
+                        btrfs_set_opt(info->mount_opt, NODATASUM);
+                        break;
+                case Opt_nodatacow:
+                        /* nodatacow also turns data checksums off */
+                        printk(KERN_INFO "btrfs: setting nodatacow\n");
+                        btrfs_set_opt(info->mount_opt, NODATACOW);
+                        btrfs_set_opt(info->mount_opt, NODATASUM);
+                        break;
+                case Opt_compress:
+                        printk(KERN_INFO "btrfs: use compression\n");
+                        btrfs_set_opt(info->mount_opt, COMPRESS);
+                        break;
+                case Opt_ssd:
+                        printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
+                        btrfs_set_opt(info->mount_opt, SSD);
+                        break;
+                case Opt_nobarrier:
+                        printk(KERN_INFO "btrfs: turning off barriers\n");
+                        btrfs_set_opt(info->mount_opt, NOBARRIER);
+                        break;
+                case Opt_thread_pool:
+                        /* a missing or zero value keeps the current size */
+                        intarg = 0;
+                        match_int(&args[0], &intarg);
+                        if (intarg) {
+                                info->thread_pool_size = intarg;
+                                printk(KERN_INFO "btrfs: thread pool %d\n",
+                                       info->thread_pool_size);
+                        }
+                        break;
+                case Opt_max_extent:
+                        num = match_strdup(&args[0]);
+                        if (num) {
+                                info->max_extent = btrfs_parse_size(num);
+                                kfree(num);
+                                /* never go below one sector */
+                                info->max_extent = max_t(u64,
+                                        info->max_extent, root->sectorsize);
+                                printk(KERN_INFO "btrfs: max_extent at %llu\n",
+                                       (unsigned long long)info->max_extent);
+                        }
+                        break;
+                case Opt_max_inline:
+                        num = match_strdup(&args[0]);
+                        if (num) {
+                                info->max_inline = btrfs_parse_size(num);
+                                kfree(num);
+                                /* zero is kept as is, otherwise >= 1 sector */
+                                if (info->max_inline) {
+                                        info->max_inline = max_t(u64,
+                                                info->max_inline,
+                                                root->sectorsize);
+                                }
+                                printk(KERN_INFO "btrfs: max_inline at %llu\n",
+                                        (unsigned long long)info->max_inline);
+                        }
+                        break;
+                case Opt_alloc_start:
+                        num = match_strdup(&args[0]);
+                        if (num) {
+                                info->alloc_start = btrfs_parse_size(num);
+                                kfree(num);
+                                printk(KERN_INFO
+                                        "btrfs: allocations start at %llu\n",
+                                        (unsigned long long)info->alloc_start);
+                        }
+                        break;
+                case Opt_noacl:
+                        root->fs_info->sb->s_flags &= ~MS_POSIXACL;
+                        break;
+                default:
+                        break;
+                }
+        }
+        kfree(orig);
+        return 0;
+}
+/*
+ * Parse mount options that are required early in the mount process.
+ *
+ * Only subvol= and device= are handled here: subvol= stores a newly
+ * allocated copy of the name in *subvol_name, device= scans the named
+ * device with btrfs_scan_one_device().  All other options are parsed much
+ * later in the mount process and only when a new super block is needed.
+ *
+ * *subvol_name must be NULL (or a kfree()-able string) on entry.  On
+ * success it always points to an allocated string that the caller must
+ * kfree(); "." is used when no subvol= option was given.  On failure
+ * nothing is left allocated and *subvol_name is NULL.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int btrfs_parse_early_options(const char *options, fmode_t flags,
+                void *holder, char **subvol_name,
+                struct btrfs_fs_devices **fs_devices)
+{
+        substring_t args[MAX_OPT_ARGS];
+        char *opts, *orig, *p, *device_name;
+        int error = 0;
+        if (!options)
+                goto out;
+        /*
+         * strsep changes the string, duplicate it because parse_options
+         * gets called twice
+         */
+        opts = kstrdup(options, GFP_KERNEL);
+        if (!opts)
+                return -ENOMEM;
+        /*
+         * strsep() moves 'opts' forward and finally sets it to NULL, so
+         * the start of the allocation has to be remembered for kfree().
+         */
+        orig = opts;
+        while ((p = strsep(&opts, ",")) != NULL) {
+                int token;
+                if (!*p)
+                        continue;
+                token = match_token(p, tokens, args);
+                switch (token) {
+                case Opt_subvol:
+                        /* a later subvol= replaces an earlier one */
+                        kfree(*subvol_name);
+                        *subvol_name = match_strdup(&args[0]);
+                        break;
+                case Opt_device:
+                        device_name = match_strdup(&args[0]);
+                        if (!device_name) {
+                                error = -ENOMEM;
+                                goto out_free_opts;
+                        }
+                        error = btrfs_scan_one_device(device_name,
+                                        flags, holder, fs_devices);
+                        /* the scan does not take ownership of the name */
+                        kfree(device_name);
+                        if (error)
+                                goto out_free_opts;
+                        break;
+                default:
+                        break;
+                }
+        }
+ out_free_opts:
+        kfree(orig);
+ out:
+        if (error) {
+                /* do not hand a half-built result back to the caller */
+                kfree(*subvol_name);
+                *subvol_name = NULL;
+                return error;
+        }
+        /*
+         * If no subvolume name is specified we use the default one.  Allocate
+         * a copy of the string "." here so that code later in the
+         * mount path doesn't care if it's the default volume or another one.
+         */
+        if (!*subvol_name) {
+                *subvol_name = kstrdup(".", GFP_KERNEL);
+                if (!*subvol_name)
+                        return -ENOMEM;
+        }
+        return 0;
+}
+/*
+ * Set up a freshly allocated super block: install the btrfs operation
+ * tables, open the tree via open_ctree(), look up the root directory
+ * inode and attach its dentry as sb->s_root.
+ *
+ * 'data' is the mount option string; it is passed on to open_ctree() and
+ * saved with save_mount_options().  'silent' is currently unused.
+ *
+ * Returns 0 on success or a negative errno; after open_ctree() succeeded
+ * every failure path closes the tree again.
+ */
+static int btrfs_fill_super(struct super_block *sb,
+                            struct btrfs_fs_devices *fs_devices,
+                            void *data, int silent)
+{
+        struct inode *inode;
+        struct dentry *root_dentry;
+        struct btrfs_root *tree_root;
+        struct btrfs_inode *bi;
+        int err;
+        sb->s_maxbytes = MAX_LFS_FILESIZE;
+        sb->s_magic = BTRFS_SUPER_MAGIC;
+        sb->s_op = &btrfs_super_ops;
+        sb->s_export_op = &btrfs_export_ops;
+        sb->s_xattr = btrfs_xattr_handlers;
+        sb->s_time_gran = 1;
+        sb->s_flags |= MS_POSIXACL;
+        tree_root = open_ctree(sb, fs_devices, (char *)data);
+        if (IS_ERR(tree_root)) {
+                printk("btrfs: open_ctree failed\n");
+                return PTR_ERR(tree_root);
+        }
+        sb->s_fs_info = tree_root;
+        inode = btrfs_iget_locked(sb, BTRFS_FIRST_FREE_OBJECTID,
+                                  tree_root->fs_info->fs_root);
+        /*
+         * Check for failure before BTRFS_I(): the fields written below
+         * live in the structure that embeds the inode.
+         */
+        if (!inode) {
+                err = -ENOMEM;
+                goto fail_close;
+        }
+        bi = BTRFS_I(inode);
+        bi->location.objectid = inode->i_ino;
+        bi->location.offset = 0;
+        bi->root = tree_root->fs_info->fs_root;
+        btrfs_set_key_type(&bi->location, BTRFS_INODE_ITEM_KEY);
+        if (inode->i_state & I_NEW) {
+                btrfs_read_locked_inode(inode);
+                unlock_new_inode(inode);
+        }
+        root_dentry = d_alloc_root(inode);
+        if (!root_dentry) {
+                iput(inode);
+                err = -ENOMEM;
+                goto fail_close;
+        }
+#if 0
+        /* this does the super kobj at the same time */
+        err = btrfs_sysfs_add_super(tree_root->fs_info);
+        if (err)
+                goto fail_close;
+#endif
+        sb->s_root = root_dentry;
+        save_mount_options(sb, data);
+        return 0;
+fail_close:
+        close_ctree(tree_root);
+        return err;
+}
+/*
+ * ->sync_fs: nothing to do on a read-only mount.  Without 'wait' only a
+ * flush of the btree inode mapping is started.  With 'wait' delalloc
+ * inodes are started, ordered extents are waited for, old snapshots are
+ * cleaned and a transaction is committed; the commit result is returned.
+ */
+int btrfs_sync_fs(struct super_block *sb, int wait)
+{
+        struct btrfs_root *root = btrfs_sb(sb);
+        struct btrfs_trans_handle *trans;
+        int err;
+        if (sb->s_flags & MS_RDONLY)
+                return 0;
+        sb->s_dirt = 0;
+        if (wait) {
+                btrfs_start_delalloc_inodes(root);
+                btrfs_wait_ordered_extents(root, 0);
+                btrfs_clean_old_snapshots(root);
+                trans = btrfs_start_transaction(root, 1);
+                err = btrfs_commit_transaction(trans, root);
+                sb->s_dirt = 0;
+                return err;
+        }
+        filemap_flush(root->fs_info->btree_inode->i_mapping);
+        return 0;
+}
+/* ->write_super: only clears the super block's dirty flag. */
+static void btrfs_write_super(struct super_block *sb)
+{
+        sb->s_dirt = 0;
+}
+/*
+ * sget() test callback: a super block matches when it was set up for the
+ * btrfs_fs_devices passed in 'data'.
+ */
+static int btrfs_test_super(struct super_block *s, void *data)
+{
+        struct btrfs_fs_devices *wanted = data;
+        return btrfs_sb(s)->fs_info->fs_devices == wanted;
+}
+/*
+ * Find a superblock for the given device / mount point.
+ *
+ * Early options are parsed, the named device is scanned and the device
+ * set is opened.  sget() then returns either an already mounted super
+ * block for that device set or a new one, which is filled in here.  The
+ * dentry placed in the vfsmount is the filesystem root, or the directory
+ * named by subvol= looked up below the root.
+ *
+ * Returns 0 on success or a negative errno.
+ *
+ * Note:  This is based on get_sb_bdev from fs/super.c with a few additions
+ *        for multiple device setup.  Make sure to keep it in sync.
+ */
+static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
+                const char *dev_name, void *data, struct vfsmount *mnt)
+{
+        char *subvol_name = NULL;
+        struct block_device *bdev = NULL;
+        struct super_block *s;
+        struct dentry *root;
+        struct btrfs_fs_devices *fs_devices = NULL;
+        fmode_t mode = FMODE_READ;
+        int error = 0;
+        if (!(flags & MS_RDONLY))
+                mode |= FMODE_WRITE;
+        error = btrfs_parse_early_options(data, mode, fs_type,
+                                          &subvol_name, &fs_devices);
+        /*
+         * The parser may already have allocated subvol_name when it
+         * fails, so leave through the label that frees it.
+         */
+        if (error)
+                goto error_free_subvol_name;
+        error = btrfs_scan_one_device(dev_name, mode, fs_type, &fs_devices);
+        if (error)
+                goto error_free_subvol_name;
+        error = btrfs_open_devices(fs_devices, mode, fs_type);
+        if (error)
+                goto error_free_subvol_name;
+        /* a read-write mount needs at least one writable device */
+        if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
+                error = -EACCES;
+                goto error_close_devices;
+        }
+        bdev = fs_devices->latest_bdev;
+        s = sget(fs_type, btrfs_test_super, set_anon_super, fs_devices);
+        if (IS_ERR(s))
+                goto error_s;
+        if (s->s_root) {
+                /* already mounted: the read-only state has to agree */
+                if ((flags ^ s->s_flags) & MS_RDONLY) {
+                        up_write(&s->s_umount);
+                        deactivate_super(s);
+                        error = -EBUSY;
+                        goto error_close_devices;
+                }
+                /* drop the device references taken by the open above */
+                btrfs_close_devices(fs_devices);
+        } else {
+                char b[BDEVNAME_SIZE];
+                s->s_flags = flags;
+                strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
+                error = btrfs_fill_super(s, fs_devices, data,
+                                         flags & MS_SILENT ? 1 : 0);
+                if (error) {
+                        up_write(&s->s_umount);
+                        deactivate_super(s);
+                        goto error_free_subvol_name;
+                }
+                btrfs_sb(s)->fs_info->bdev_holder = fs_type;
+                s->s_flags |= MS_ACTIVE;
+        }
+        if (!strcmp(subvol_name, "."))
+                root = dget(s->s_root);
+        else {
+                mutex_lock(&s->s_root->d_inode->i_mutex);
+                root = lookup_one_len(subvol_name, s->s_root,
+                                      strlen(subvol_name));
+                mutex_unlock(&s->s_root->d_inode->i_mutex);
+                if (IS_ERR(root)) {
+                        up_write(&s->s_umount);
+                        deactivate_super(s);
+                        error = PTR_ERR(root);
+                        goto error_free_subvol_name;
+                }
+                /* a negative dentry means there is no such subvolume */
+                if (!root->d_inode) {
+                        dput(root);
+                        up_write(&s->s_umount);
+                        deactivate_super(s);
+                        error = -ENXIO;
+                        goto error_free_subvol_name;
+                }
+        }
+        mnt->mnt_sb = s;
+        mnt->mnt_root = root;
+        kfree(subvol_name);
+        return 0;
+error_s:
+        error = PTR_ERR(s);
+error_close_devices:
+        btrfs_close_devices(fs_devices);
+error_free_subvol_name:
+        kfree(subvol_name);
+        return error;
+}
+/*
+ * ->remount_fs: only a change of the read-only state is acted upon.
+ *
+ * Going read-only sets MS_RDONLY and commits the super.  Going read-write
+ * is refused with -EACCES when there is no writable device and with
+ * -EINVAL when the super block copy records a log root; otherwise reloc
+ * trees and fs roots are cleaned up and MS_RDONLY is cleared.
+ */
+static int btrfs_remount(struct super_block *sb, int *flags, char *data)
+{
+        struct btrfs_root *root = btrfs_sb(sb);
+        int ret;
+        if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
+                return 0;
+        if (!(*flags & MS_RDONLY)) {
+                /* read-only -> read-write */
+                if (root->fs_info->fs_devices->rw_devices == 0)
+                        return -EACCES;
+                if (btrfs_super_log_root(&root->fs_info->super_copy) != 0)
+                        return -EINVAL;
+                ret = btrfs_cleanup_reloc_trees(root);
+                WARN_ON(ret);
+                ret = btrfs_cleanup_fs_roots(root->fs_info);
+                WARN_ON(ret);
+                sb->s_flags &= ~MS_RDONLY;
+                return 0;
+        }
+        /* read-write -> read-only */
+        sb->s_flags |= MS_RDONLY;
+        ret =  btrfs_commit_super(root);
+        WARN_ON(ret);
+        return 0;
+}
+/*
+ * ->statfs: block counts are derived from the total_bytes and bytes_used
+ * fields of the super block copy, expressed in units of the VFS block
+ * size.  Free and available blocks are reported as the same number.
+ */
+static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+        struct super_block *sb = dentry->d_sb;
+        struct btrfs_root *root = btrfs_sb(sb);
+        struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
+        struct btrfs_root *subvol = BTRFS_I(dentry->d_inode)->root;
+        __be32 *fsid = (__be32 *)root->fs_info->fsid;
+        int bits = sb->s_blocksize_bits;
+        buf->f_type = BTRFS_SUPER_MAGIC;
+        buf->f_bsize = sb->s_blocksize;
+        buf->f_namelen = BTRFS_NAME_LEN;
+        buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
+        buf->f_bfree = buf->f_blocks -
+                (btrfs_super_bytes_used(disk_super) >> bits);
+        buf->f_bavail = buf->f_bfree;
+        /*
+         * The fsid bytes are read with a fixed endianness (which one does
+         * not matter) so the result is the same on big-endian and
+         * little-endian hosts.
+         */
+        buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]);
+        buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]);
+        /* Mix in the root object ID as well so subvols can be told apart */
+        buf->f_fsid.val[0] ^= subvol->objectid >> 32;
+        buf->f_fsid.val[1] ^= subvol->objectid;
+        return 0;
+}
+/*
+ * Filesystem type registered by init_btrfs_fs().  Super blocks are
+ * obtained through btrfs_get_sb() and released with kill_anon_super().
+ */
+static struct file_system_type btrfs_fs_type = {
+        .owner          = THIS_MODULE,
+        .name           = "btrfs",
+        .get_sb         = btrfs_get_sb,
+        .kill_sb        = kill_anon_super,
+        .fs_flags       = FS_REQUIRES_DEV,
+};
+/*
+ * used by btrfsctl to scan devices when no FS is mounted
+ *
+ * The caller needs CAP_SYS_ADMIN.  'arg' points to a user space
+ * struct btrfs_ioctl_vol_args whose name field holds the device path.
+ * BTRFS_IOC_SCAN_DEV is the only command handled.
+ *
+ * Returns the result of the scan, -EPERM without the capability, -ENOMEM
+ * or -EFAULT when the argument cannot be copied in, and -ENOTTY for any
+ * other command.
+ */
+static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
+                                unsigned long arg)
+{
+        struct btrfs_ioctl_vol_args *vol;
+        struct btrfs_fs_devices *fs_devices;
+        int ret = -ENOTTY;
+        if (!capable(CAP_SYS_ADMIN))
+                return -EPERM;
+        vol = kmalloc(sizeof(*vol), GFP_KERNEL);
+        if (!vol)
+                return -ENOMEM;
+        if (copy_from_user(vol, (void __user *)arg, sizeof(*vol))) {
+                ret = -EFAULT;
+                goto out;
+        }
+        /* the path comes from user space, make sure it is terminated */
+        vol->name[sizeof(vol->name) - 1] = '\0';
+        switch (cmd) {
+        case BTRFS_IOC_SCAN_DEV:
+                ret = btrfs_scan_one_device(vol->name, FMODE_READ,
+                                            &btrfs_fs_type, &fs_devices);
+                break;
+        }
+out:
+        kfree(vol);
+        return ret;
+}
+/*
+ * ->freeze_fs: take transaction_kthread_mutex and then cleaner_mutex and
+ * return with both held; btrfs_unfreeze() releases them.
+ */
+static int btrfs_freeze(struct super_block *sb)
+{
+        struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info;
+        mutex_lock(&fs_info->transaction_kthread_mutex);
+        mutex_lock(&fs_info->cleaner_mutex);
+        return 0;
+}
+/*
+ * ->unfreeze_fs: release the two mutexes taken by btrfs_freeze(), in the
+ * reverse order of their acquisition.
+ */
+static int btrfs_unfreeze(struct super_block *sb)
+{
+        struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info;
+        mutex_unlock(&fs_info->cleaner_mutex);
+        mutex_unlock(&fs_info->transaction_kthread_mutex);
+        return 0;
+}
+/*
+ * Super block operations installed by btrfs_fill_super().  Mount options
+ * are shown by generic_show_options() from the string that
+ * btrfs_fill_super() stores with save_mount_options().
+ */
+static struct super_operations btrfs_super_ops = {
+        .delete_inode   = btrfs_delete_inode,
+        .put_super      = btrfs_put_super,
+        .write_super    = btrfs_write_super,
+        .sync_fs        = btrfs_sync_fs,
+        .show_options   = generic_show_options,
+        .write_inode    = btrfs_write_inode,
+        .dirty_inode    = btrfs_dirty_inode,
+        .alloc_inode    = btrfs_alloc_inode,
+        .destroy_inode  = btrfs_destroy_inode,
+        .statfs         = btrfs_statfs,
+        .remount_fs     = btrfs_remount,
+        .freeze_fs      = btrfs_freeze,
+        .unfreeze_fs    = btrfs_unfreeze,
+};
+/*
+ * File operations of the control device; native and compat ioctls share
+ * the same handler.
+ */
+static const struct file_operations btrfs_ctl_fops = {
+        .unlocked_ioctl  = btrfs_control_ioctl,
+        .compat_ioctl = btrfs_control_ioctl,
+        .owner   = THIS_MODULE,
+};
+/* The "btrfs-control" misc device, registered with a dynamic minor. */
+static struct miscdevice btrfs_misc = {
+        .minor          = MISC_DYNAMIC_MINOR,
+        .name           = "btrfs-control",
+        .fops           = &btrfs_ctl_fops
+};
+/* Register the control device; returns the result of misc_register(). */
+static int btrfs_interface_init(void)
+{
+        return misc_register(&btrfs_misc);
+}
+/*
+ * Unregister the control device.  A failure is only logged; the message
+ * is newline terminated so it does not run into the next log line.
+ */
+static void btrfs_interface_exit(void)
+{
+        if (misc_deregister(&btrfs_misc) < 0)
+                printk(KERN_INFO "misc_deregister failed for control device\n");
+}
+/*
+ * Module init: bring up sysfs, the slab caches, extent io, extent map and
+ * the control device, then register the filesystem type.  If a step
+ * fails, the steps completed before it are undone in reverse order and
+ * the error is returned.
+ */
+static int __init init_btrfs_fs(void)
+{
+        int err;
+        err = btrfs_init_sysfs();
+        if (err)
+                return err;
+        err = btrfs_init_cachep();
+        if (err)
+                goto free_sysfs;
+        err = extent_io_init();
+        if (err)
+                goto free_cachep;
+        err = extent_map_init();
+        if (err)
+                goto free_extent_io;
+        err = btrfs_interface_init();
+        if (err)
+                goto free_extent_map;
+        err = register_filesystem(&btrfs_fs_type);
+        if (err)
+                goto unregister_ioctl;
+        printk(KERN_INFO "%s loaded\n", BTRFS_BUILD_VERSION);
+        return 0;
+        /* each label undoes one step and falls through to the next */
+unregister_ioctl:
+        btrfs_interface_exit();
+free_extent_map:
+        extent_map_exit();
+free_extent_io:
+        extent_io_exit();
+free_cachep:
+        btrfs_destroy_cachep();
+free_sysfs:
+        btrfs_exit_sysfs();
+        return err;
+}
+/*
+ * Module exit: tear down what init_btrfs_fs() set up and additionally
+ * release the fs uuid list and zlib state.
+ * NOTE(review): the matching setup for the last two is not in this file's
+ * init path -- presumably done on demand elsewhere; confirm.
+ */
+static void __exit exit_btrfs_fs(void)
+{
+        btrfs_destroy_cachep();
+        extent_map_exit();
+        extent_io_exit();
+        btrfs_interface_exit();
+        unregister_filesystem(&btrfs_fs_type);
+        btrfs_exit_sysfs();
+        btrfs_cleanup_fs_uuids();
+        btrfs_zlib_exit();
+}
+/* Module entry and exit hooks, and the module license. */
+module_init(init_btrfs_fs)
+module_exit(exit_btrfs_fs)
+MODULE_LICENSE("GPL");
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
new file mode 100644
index 000000000000..a240b6fa81df
--- /dev/null
+++ b/fs/btrfs/sysfs.c
@@ -0,0 +1,269 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/module.h>
+#include <linux/kobject.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+/* sysfs show: print btrfs_root_used() of the root item as a decimal line. */
+static ssize_t root_blocks_used_show(struct btrfs_root *root, char *buf)
+{
+        unsigned long long used = btrfs_root_used(&root->root_item);
+        return snprintf(buf, PAGE_SIZE, "%llu\n", used);
+}
+/* sysfs show: print btrfs_root_limit() of the root item as a decimal line. */
+static ssize_t root_block_limit_show(struct btrfs_root *root, char *buf)
+{
+        unsigned long long limit = btrfs_root_limit(&root->root_item);
+        return snprintf(buf, PAGE_SIZE, "%llu\n", limit);
+}
+/*
+ * sysfs show: print the bytes_used field of the super block copy as a
+ * decimal line.
+ */
+static ssize_t super_blocks_used_show(struct btrfs_fs_info *fs, char *buf)
+{
+        unsigned long long used = btrfs_super_bytes_used(&fs->super_copy);
+        return snprintf(buf, PAGE_SIZE, "%llu\n", used);
+}
+/*
+ * sysfs show: print the total_bytes field of the super block copy as a
+ * decimal line.
+ */
+static ssize_t super_total_blocks_show(struct btrfs_fs_info *fs, char *buf)
+{
+        unsigned long long total = btrfs_super_total_bytes(&fs->super_copy);
+        return snprintf(buf, PAGE_SIZE, "%llu\n", total);
+}
+/*
+ * sysfs show: print the sectorsize field of the super block copy as a
+ * decimal line.
+ */
+static ssize_t super_blocksize_show(struct btrfs_fs_info *fs, char *buf)
+{
+        unsigned long long size = btrfs_super_sectorsize(&fs->super_copy);
+        return snprintf(buf, PAGE_SIZE, "%llu\n", size);
+}
+/*
+ * this is for root attrs (subvols/snapshots)
+ *
+ * A sysfs attribute plus show/store callbacks that take the btrfs_root
+ * the attribute belongs to.  Either callback may be NULL.
+ */
+struct btrfs_root_attr {
+        struct attribute attr;
+        ssize_t (*show)(struct btrfs_root *, char *);
+        ssize_t (*store)(struct btrfs_root *, const char *, size_t);
+};
+/*
+ * Define a static btrfs_root_attr named btrfs_root_attr_<name>.
+ * Both attributes below have no store callback; block_limit is still
+ * created with mode 0644.
+ */
+#define ROOT_ATTR(name, mode, show, store) \
+static struct btrfs_root_attr btrfs_root_attr_##name = __ATTR(name, mode, \
+                                                              show, store)
+ROOT_ATTR(blocks_used,  0444,   root_blocks_used_show,  NULL);
+ROOT_ATTR(block_limit,  0644,   root_block_limit_show,  NULL);
+/* NULL-terminated default attribute list of btrfs_root_ktype. */
+static struct attribute *btrfs_root_attrs[] = {
+        &btrfs_root_attr_blocks_used.attr,
+        &btrfs_root_attr_block_limit.attr,
+        NULL,
+};
+/*
+ * this is for super attrs (actual full fs)
+ *
+ * A sysfs attribute plus show/store callbacks that take the
+ * btrfs_fs_info the attribute belongs to.  Either callback may be NULL.
+ */
+struct btrfs_super_attr {
+        struct attribute attr;
+        ssize_t (*show)(struct btrfs_fs_info *, char *);
+        ssize_t (*store)(struct btrfs_fs_info *, const char *, size_t);
+};
+/*
+ * Define a static btrfs_super_attr named btrfs_super_attr_<name>.
+ * All three attributes below are read-only (0444, no store callback).
+ */
+#define SUPER_ATTR(name, mode, show, store) \
+static struct btrfs_super_attr btrfs_super_attr_##name = __ATTR(name, mode, \
+                                                                show, store)
+SUPER_ATTR(blocks_used,         0444,   super_blocks_used_show,         NULL);
+SUPER_ATTR(total_blocks,        0444,   super_total_blocks_show,        NULL);
+SUPER_ATTR(blocksize,           0444,   super_blocksize_show,           NULL);
+/* NULL-terminated default attribute list of btrfs_super_ktype. */
+static struct attribute *btrfs_super_attrs[] = {
+        &btrfs_super_attr_blocks_used.attr,
+        &btrfs_super_attr_total_blocks.attr,
+        &btrfs_super_attr_blocksize.attr,
+        NULL,
+};
+/*
+ * sysfs_ops ->show for super kobjects: forward to the attribute's show
+ * callback with the owning btrfs_fs_info, or return 0 if there is none.
+ */
+static ssize_t btrfs_super_attr_show(struct kobject *kobj,
+                                    struct attribute *attr, char *buf)
+{
+        struct btrfs_super_attr *sattr;
+        struct btrfs_fs_info *fs_info;
+        sattr = container_of(attr, struct btrfs_super_attr, attr);
+        if (!sattr->show)
+                return 0;
+        fs_info = container_of(kobj, struct btrfs_fs_info, super_kobj);
+        return sattr->show(fs_info, buf);
+}
+/*
+ * sysfs_ops ->store for super kobjects: forward to the attribute's store
+ * callback with the owning btrfs_fs_info, or return 0 if there is none.
+ */
+static ssize_t btrfs_super_attr_store(struct kobject *kobj,
+                                     struct attribute *attr,
+                                     const char *buf, size_t len)
+{
+        struct btrfs_super_attr *sattr;
+        struct btrfs_fs_info *fs_info;
+        sattr = container_of(attr, struct btrfs_super_attr, attr);
+        if (!sattr->store)
+                return 0;
+        fs_info = container_of(kobj, struct btrfs_fs_info, super_kobj);
+        return sattr->store(fs_info, buf, len);
+}
+/*
+ * sysfs_ops ->show for root kobjects: forward to the attribute's show
+ * callback with the owning btrfs_root, or return 0 if there is none.
+ */
+static ssize_t btrfs_root_attr_show(struct kobject *kobj,
+                                    struct attribute *attr, char *buf)
+{
+        struct btrfs_root_attr *rattr;
+        struct btrfs_root *owner;
+        rattr = container_of(attr, struct btrfs_root_attr, attr);
+        if (!rattr->show)
+                return 0;
+        owner = container_of(kobj, struct btrfs_root, root_kobj);
+        return rattr->show(owner, buf);
+}
+/*
+ * sysfs_ops ->store for root kobjects: forward to the attribute's store
+ * callback with the owning btrfs_root, or return 0 if there is none.
+ */
+static ssize_t btrfs_root_attr_store(struct kobject *kobj,
+                                     struct attribute *attr,
+                                     const char *buf, size_t len)
+{
+        struct btrfs_root_attr *rattr;
+        struct btrfs_root *owner;
+        rattr = container_of(attr, struct btrfs_root_attr, attr);
+        if (!rattr->store)
+                return 0;
+        owner = container_of(kobj, struct btrfs_root, root_kobj);
+        return rattr->store(owner, buf, len);
+}
+/*
+ * kobject release for the super kobject: signal kobj_unregister, which
+ * btrfs_sysfs_del_super() waits on.
+ */
+static void btrfs_super_release(struct kobject *kobj)
+{
+        struct btrfs_fs_info *fs_info;
+        fs_info = container_of(kobj, struct btrfs_fs_info, super_kobj);
+        complete(&fs_info->kobj_unregister);
+}
+/*
+ * kobject release for a root kobject: signal kobj_unregister, which
+ * btrfs_sysfs_del_root() waits on.
+ */
+static void btrfs_root_release(struct kobject *kobj)
+{
+        struct btrfs_root *owner;
+        owner = container_of(kobj, struct btrfs_root, root_kobj);
+        complete(&owner->kobj_unregister);
+}
+/* sysfs show/store dispatch for super kobjects. */
+static struct sysfs_ops btrfs_super_attr_ops = {
+        .show   = btrfs_super_attr_show,
+        .store  = btrfs_super_attr_store,
+};
+/* sysfs show/store dispatch for root kobjects. */
+static struct sysfs_ops btrfs_root_attr_ops = {
+        .show   = btrfs_root_attr_show,
+        .store  = btrfs_root_attr_store,
+};
+/* kobject type used for root kobjects by btrfs_sysfs_add_root(). */
+static struct kobj_type btrfs_root_ktype = {
+        .default_attrs  = btrfs_root_attrs,
+        .sysfs_ops      = &btrfs_root_attr_ops,
+        .release        = btrfs_root_release,
+};
+/* kobject type used for the super kobject by btrfs_sysfs_add_super(). */
+static struct kobj_type btrfs_super_ktype = {
+        .default_attrs  = btrfs_super_attrs,
+        .sysfs_ops      = &btrfs_super_attr_ops,
+        .release        = btrfs_super_release,
+};
+/*
+ * /sys/fs/btrfs/ entry, created by btrfs_init_sysfs() and removed by
+ * btrfs_exit_sysfs()
+ */
+static struct kset *btrfs_kset;
+/*
+ * Create the sysfs kobject for a filesystem inside btrfs_kset.
+ *
+ * The kobject is named after sb->s_id with every '/' and '\' replaced by
+ * '!'.  Returns 0 on success or a negative errno; failures are logged.
+ */
+int btrfs_sysfs_add_super(struct btrfs_fs_info *fs)
+{
+        int error;
+        char *name;
+        char c;
+        /* 'len' counts the terminating NUL as well */
+        int len = strlen(fs->sb->s_id) + 1;
+        int i;
+        name = kmalloc(len, GFP_NOFS);
+        if (!name) {
+                error = -ENOMEM;
+                goto fail;
+        }
+        for (i = 0; i < len; i++) {
+                c = fs->sb->s_id[i];
+                if (c == '/' || c == '\\')
+                        c = '!';
+                name[i] = c;
+        }
+        /*
+         * The loop copied the terminator already.  The last valid index
+         * is len - 1; writing name[len] would land past the allocation.
+         */
+        name[len - 1] = '\0';
+        fs->super_kobj.kset = btrfs_kset;
+        error = kobject_init_and_add(&fs->super_kobj, &btrfs_super_ktype,
+                                     NULL, "%s", name);
+        kfree(name);
+        if (error)
+                goto fail;
+        return 0;
+fail:
+        printk(KERN_ERR "btrfs: sysfs creation for super failed\n");
+        return error;
+}
+/*
+ * Create the sysfs kobject for a root below the filesystem's super
+ * kobject, named after root->name.  Returns 0 on success or the error
+ * from kobject_init_and_add(), which is also logged.
+ */
+int btrfs_sysfs_add_root(struct btrfs_root *root)
+{
+        int error = kobject_init_and_add(&root->root_kobj, &btrfs_root_ktype,
+                                         &root->fs_info->super_kobj,
+                                         "%s", root->name);
+        if (error)
+                printk(KERN_ERR "btrfs: sysfs creation for root failed\n");
+        return error;
+}
+/*
+ * Drop the reference on the root kobject and wait until its release
+ * callback, btrfs_root_release(), has signalled kobj_unregister.
+ */
+void btrfs_sysfs_del_root(struct btrfs_root *root)
+{
+        kobject_put(&root->root_kobj);
+        wait_for_completion(&root->kobj_unregister);
+}
+/*
+ * Drop the reference on the super kobject and wait until its release
+ * callback, btrfs_super_release(), has signalled kobj_unregister.
+ */
+void btrfs_sysfs_del_super(struct btrfs_fs_info *fs)
+{
+        kobject_put(&fs->super_kobj);
+        wait_for_completion(&fs->kobj_unregister);
+}
+/*
+ * Create the "btrfs" kset below fs_kobj.  Returns 0 on success and
+ * -ENOMEM if the kset could not be created.
+ */
+int btrfs_init_sysfs(void)
+{
+        btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj);
+        return btrfs_kset ? 0 : -ENOMEM;
+}
+/* Remove the kset created by btrfs_init_sysfs(). */
+void btrfs_exit_sysfs(void)
+{
+        kset_unregister(btrfs_kset);
+}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
new file mode 100644
index 000000000000..8a08f9443340
--- /dev/null
+++ b/fs/btrfs/transaction.c
@@ -0,0 +1,1097 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/writeback.h>
+#include <linux/pagemap.h>
+#include <linux/blkdev.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "locking.h"
+#include "ref-cache.h"
+#include "tree-log.h"
+#define BTRFS_ROOT_TRANS_TAG 0
+/*
+ * Release one reference on @transaction.  Dropping the final reference
+ * unlinks the transaction from its list, wipes it and hands it back to the
+ * transaction slab cache.
+ */
+static noinline void put_transaction(struct btrfs_transaction *transaction)
+{
+        WARN_ON(transaction->use_count == 0);
+        if (--transaction->use_count != 0)
+                return;
+        /* that was the last user */
+        list_del_init(&transaction->list);
+        memset(transaction, 0, sizeof(*transaction));
+        kmem_cache_free(btrfs_transaction_cachep, transaction);
+}
+/*
+ * Attach the caller to the running transaction as a writer.  If no
+ * transaction is running, a fresh one is allocated, initialised with a new
+ * generation number and published as fs_info->running_transaction.
+ *
+ * Always returns 0; an allocation failure is a BUG.
+ */
+static noinline int join_transaction(struct btrfs_root *root)
+{
+        struct btrfs_fs_info *info = root->fs_info;
+        struct btrfs_transaction *trans = info->running_transaction;
+        if (trans) {
+                /* hop into the transaction that is already open */
+                trans->num_writers++;
+                trans->num_joined++;
+                return 0;
+        }
+        trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
+        BUG_ON(!trans);
+        info->generation++;
+        info->last_alloc = 0;
+        info->last_data_alloc = 0;
+        trans->num_writers = 1;
+        trans->num_joined = 0;
+        trans->transid = info->generation;
+        init_waitqueue_head(&trans->writer_wait);
+        init_waitqueue_head(&trans->commit_wait);
+        trans->in_commit = 0;
+        trans->blocked = 0;
+        trans->use_count = 1;
+        trans->commit_done = 0;
+        trans->start_time = get_seconds();
+        INIT_LIST_HEAD(&trans->pending_snapshots);
+        list_add_tail(&trans->list, &info->trans_list);
+        extent_io_tree_init(&trans->dirty_pages,
+                            info->btree_inode->i_mapping, GFP_NOFS);
+        /* publish the new transaction under new_trans_lock */
+        spin_lock(&info->new_trans_lock);
+        info->running_transaction = trans;
+        spin_unlock(&info->new_trans_lock);
+        return 0;
+}
+/*
+ * Note that a reference counted root is being modified in the running
+ * transaction.  The first time a root is seen in a transaction, a private
+ * copy of it (pointing at the pre-transaction root node) is taken and hung
+ * off root->dirty_root so the old root can be deleted when the transaction
+ * commits.
+ *
+ * Always returns 0; allocation failures are a BUG.
+ */
+noinline int btrfs_record_root_in_trans(struct btrfs_root *root)
+{
+        struct btrfs_fs_info *info = root->fs_info;
+        struct btrfs_dirty_root *dirty;
+        struct btrfs_root *copy;
+        u64 running_trans_id = info->running_transaction->transid;
+        /* nothing to do for non-COW roots or roots already recorded */
+        if (!root->ref_cows || root->last_trans >= running_trans_id)
+                return 0;
+        WARN_ON(root == info->extent_root);
+        if (root->root_item.refs == 0) {
+                WARN_ON(1);
+        } else {
+                radix_tree_tag_set(&info->fs_roots_radix,
+                                   (unsigned long)root->root_key.objectid,
+                                   BTRFS_ROOT_TRANS_TAG);
+                dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
+                BUG_ON(!dirty);
+                copy = kmalloc(sizeof(*copy), GFP_NOFS);
+                BUG_ON(!copy);
+                dirty->root = copy;
+                dirty->latest_root = root;
+                INIT_LIST_HEAD(&dirty->list);
+                /* commit_root must be set before the snapshot copy is taken */
+                root->commit_root = btrfs_root_node(root);
+                memcpy(copy, root, sizeof(*root));
+                /* the copy needs its own locks and list head */
+                spin_lock_init(&copy->node_lock);
+                spin_lock_init(&copy->list_lock);
+                mutex_init(&copy->objectid_mutex);
+                mutex_init(&copy->log_mutex);
+                INIT_LIST_HEAD(&copy->dead_list);
+                copy->node = root->commit_root;
+                copy->commit_root = NULL;
+                spin_lock(&root->list_lock);
+                list_add(&copy->dead_list, &root->dead_list);
+                spin_unlock(&root->list_lock);
+                root->dirty_root = dirty;
+        }
+        root->last_trans = running_trans_id;
+        return 0;
+}
+/* wait for commit against the current transaction to become unblocked
+ * when this is done, it is safe to start a new transaction, but the current
+ * transaction might not be fully on disk.
+ *
+ * trans_mutex is dropped around schedule() and re-taken afterwards, so the
+ * caller is expected to hold it on entry (the callers visible here do).
+ */
+static void wait_current_trans(struct btrfs_root *root)
+{
+        struct btrfs_transaction *cur_trans;
+        cur_trans = root->fs_info->running_transaction;
+        if (cur_trans && cur_trans->blocked) {
+                DEFINE_WAIT(wait);
+                /* hold a reference across the sleep; dropped at the end */
+                cur_trans->use_count++;
+                while (1) {
+                        prepare_to_wait(&root->fs_info->transaction_wait, &wait,
+                                        TASK_UNINTERRUPTIBLE);
+                        /* re-check after queueing ourselves on the waitqueue */
+                        if (cur_trans->blocked) {
+                                mutex_unlock(&root->fs_info->trans_mutex);
+                                schedule();
+                                mutex_lock(&root->fs_info->trans_mutex);
+                                finish_wait(&root->fs_info->transaction_wait,
+                                            &wait);
+                        } else {
+                                finish_wait(&root->fs_info->transaction_wait,
+                                            &wait);
+                                break;
+                        }
+                }
+                put_transaction(cur_trans);
+        }
+}
+/*
+ * Allocate a transaction handle and attach it to the running transaction,
+ * creating the transaction if none is running.
+ *
+ * @num_blocks is stored in the handle as blocks_reserved.
+ * @wait selects whether to sleep on a blocked transaction first:
+ *   0 - never wait
+ *   1 - wait unless fs_info->open_ioctl_trans is set
+ *   2 - wait regardless of open_ioctl_trans
+ * No waiting happens while fs_info->log_root_recovering is set.
+ *
+ * Never returns NULL.  The callers in this file use the handle without
+ * checking it, so an allocation failure is treated as fatal here, the same
+ * way join_transaction() treats a failed transaction allocation.
+ */
+static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
+                                             int num_blocks, int wait)
+{
+        struct btrfs_trans_handle *h =
+                kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
+        int ret;
+        /* fail loudly before taking trans_mutex rather than oops under it */
+        BUG_ON(!h);
+        mutex_lock(&root->fs_info->trans_mutex);
+        if (!root->fs_info->log_root_recovering &&
+            ((wait == 1 && !root->fs_info->open_ioctl_trans) || wait == 2))
+                wait_current_trans(root);
+        ret = join_transaction(root);
+        BUG_ON(ret);
+        btrfs_record_root_in_trans(root);
+        h->transid = root->fs_info->running_transaction->transid;
+        h->transaction = root->fs_info->running_transaction;
+        h->blocks_reserved = num_blocks;
+        h->blocks_used = 0;
+        h->block_group = 0;
+        h->alloc_exclude_nr = 0;
+        h->alloc_exclude_start = 0;
+        /* the handle holds its own reference on the transaction */
+        root->fs_info->running_transaction->use_count++;
+        mutex_unlock(&root->fs_info->trans_mutex);
+        return h;
+}
+/*
+ * Start a transaction handle with wait mode 1: start_transaction() first
+ * waits for a blocked transaction unless an ioctl transaction is open.
+ */
+struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
+                                                   int num_blocks)
+{
+        return start_transaction(root, num_blocks, 1);
+}
+/*
+ * Start a transaction handle with wait mode 0: start_transaction() joins
+ * the running transaction without waiting for it to become unblocked.
+ */
+struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
+                                                   int num_blocks)
+{
+        return start_transaction(root, num_blocks, 0);
+}
+/*
+ * Start a transaction handle with wait mode 2: start_transaction() waits
+ * for a blocked transaction even when an ioctl transaction is open.
+ */
+struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
+                                                         int num_blocks)
+{
+        return start_transaction(r, num_blocks, 2);
+}
+/* wait for a transaction commit to be fully complete
+ *
+ * Sleeps on commit->commit_wait until commit->commit_done is set.
+ * trans_mutex is taken here, released around each schedule() and released
+ * again before returning.  Always returns 0.
+ */
+static noinline int wait_for_commit(struct btrfs_root *root,
+                                    struct btrfs_transaction *commit)
+{
+        DEFINE_WAIT(wait);
+        mutex_lock(&root->fs_info->trans_mutex);
+        while (!commit->commit_done) {
+                prepare_to_wait(&commit->commit_wait, &wait,
+                                TASK_UNINTERRUPTIBLE);
+                /* re-check after queueing ourselves on the waitqueue */
+                if (commit->commit_done)
+                        break;
+                mutex_unlock(&root->fs_info->trans_mutex);
+                schedule();
+                mutex_lock(&root->fs_info->trans_mutex);
+        }
+        mutex_unlock(&root->fs_info->trans_mutex);
+        finish_wait(&commit->commit_wait, &wait);
+        return 0;
+}
+/*
+ * rate limit against the drop_snapshot code.  This helps to slow down new
+ * operations if the drop_snapshot code isn't able to keep up.
+ *
+ * While info->throttles is non-zero the caller sleeps on
+ * info->transaction_throttle until either throttles drops to zero or
+ * info->throttle_gen changes.  The whole wait is then repeated a bounded
+ * number of times depending on how large total_ref_cache_size is.
+ */
+static void throttle_on_drops(struct btrfs_root *root)
+{
+        struct btrfs_fs_info *info = root->fs_info;
+        int harder_count = 0;
+harder:
+        if (atomic_read(&info->throttles)) {
+                DEFINE_WAIT(wait);
+                int thr;
+                /* snapshot the generation; a change ends this wait round */
+                thr = atomic_read(&info->throttle_gen);
+                do {
+                        prepare_to_wait(&info->transaction_throttle,
+                                        &wait, TASK_UNINTERRUPTIBLE);
+                        if (!atomic_read(&info->throttles)) {
+                                finish_wait(&info->transaction_throttle, &wait);
+                                break;
+                        }
+                        schedule();
+                        finish_wait(&info->transaction_throttle, &wait);
+                } while (thr == atomic_read(&info->throttle_gen));
+                harder_count++;
+                /*
+                 * The bigger the ref cache, the more rounds we are willing
+                 * to wait: up to 2 above 1MB, 10 above 5MB, 20 above 10MB.
+                 */
+                if (root->fs_info->total_ref_cache_size > 1 * 1024 * 1024 &&
+                    harder_count < 2)
+                        goto harder;
+                if (root->fs_info->total_ref_cache_size > 5 * 1024 * 1024 &&
+                    harder_count < 10)
+                        goto harder;
+                if (root->fs_info->total_ref_cache_size > 10 * 1024 * 1024 &&
+                    harder_count < 20)
+                        goto harder;
+        }
+}
+/*
+ * Slow the caller down: wait for a blocked transaction to become unblocked
+ * (skipped while an ioctl transaction is open), then apply the
+ * drop-snapshot rate limit.
+ */
+void btrfs_throttle(struct btrfs_root *root)
+{
+        struct btrfs_fs_info *info = root->fs_info;
+        mutex_lock(&info->trans_mutex);
+        if (!info->open_ioctl_trans)
+                wait_current_trans(root);
+        mutex_unlock(&info->trans_mutex);
+        throttle_on_drops(root);
+}
+/*
+ * Detach a handle from the running transaction and free the handle.
+ *
+ * Under trans_mutex the writer count is dropped, anyone sleeping on
+ * writer_wait is woken and the handle's reference on the transaction is
+ * put.  If @throttle is non-zero the caller is then rate limited via
+ * throttle_on_drops().  Always returns 0.
+ */
+static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
+                          struct btrfs_root *root, int throttle)
+{
+        struct btrfs_transaction *cur_trans;
+        struct btrfs_fs_info *info = root->fs_info;
+        mutex_lock(&info->trans_mutex);
+        cur_trans = info->running_transaction;
+        /* the handle is expected to belong to the running transaction */
+        WARN_ON(cur_trans != trans->transaction);
+        WARN_ON(cur_trans->num_writers < 1);
+        cur_trans->num_writers--;
+        if (waitqueue_active(&cur_trans->writer_wait))
+                wake_up(&cur_trans->writer_wait);
+        put_transaction(cur_trans);
+        mutex_unlock(&info->trans_mutex);
+        /* scrub the handle before handing it back to the slab */
+        memset(trans, 0, sizeof(*trans));
+        kmem_cache_free(btrfs_trans_handle_cachep, trans);
+        if (throttle)
+                throttle_on_drops(root);
+        return 0;
+}
+/*
+ * End a transaction handle without throttling the caller.
+ */
+int btrfs_end_transaction(struct btrfs_trans_handle *trans,
+                          struct btrfs_root *root)
+{
+        return __btrfs_end_transaction(trans, root, 0);
+}
+/*
+ * End a transaction handle and then rate limit the caller against the
+ * drop-snapshot code.
+ */
+int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
+                                   struct btrfs_root *root)
+{
+        return __btrfs_end_transaction(trans, root, 1);
+}
+/*
+ * when btree blocks are allocated, they have some corresponding bits set for
+ * them in one of two extent_io trees.  This is used to make sure all of
+ * those extents are on disk for transaction or log commit
+ *
+ * Two passes are made over the EXTENT_DIRTY ranges in @dirty_pages: the
+ * first starts writeback on every cached btree page in those ranges, the
+ * second clears the dirty bits and waits for writeback of each page to
+ * finish.  Returns 0, or an error reported by write_one_page().
+ */
+int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
+                                        struct extent_io_tree *dirty_pages)
+{
+        int ret;
+        int err = 0;
+        int werr = 0;
+        struct page *page;
+        struct inode *btree_inode = root->fs_info->btree_inode;
+        u64 start = 0;
+        u64 end;
+        unsigned long index;
+        /* pass 1: kick off writes, resuming the search where we left off */
+        while (1) {
+                ret = find_first_extent_bit(dirty_pages, start, &start, &end,
+                                            EXTENT_DIRTY);
+                if (ret)
+                        break;
+                while (start <= end) {
+                        cond_resched();
+                        index = start >> PAGE_CACHE_SHIFT;
+                        /* advance to the next page before any 'continue' */
+                        start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
+                        page = find_get_page(btree_inode->i_mapping, index);
+                        if (!page)
+                                continue;
+                        btree_lock_page_hook(page);
+                        /* page no longer belongs to a mapping: skip it */
+                        if (!page->mapping) {
+                                unlock_page(page);
+                                page_cache_release(page);
+                                continue;
+                        }
+                        if (PageWriteback(page)) {
+                                /*
+                                 * Dirtied again while under writeback: wait
+                                 * so it can be written once more.  Otherwise
+                                 * the write already in flight is enough.
+                                 */
+                                if (PageDirty(page))
+                                        wait_on_page_writeback(page);
+                                else {
+                                        unlock_page(page);
+                                        page_cache_release(page);
+                                        continue;
+                                }
+                        }
+                        /*
+                         * NOTE(review): no unlock_page() follows, so
+                         * write_one_page() is presumably responsible for
+                         * unlocking the page -- confirm.
+                         */
+                        err = write_one_page(page, 0);
+                        if (err)
+                                werr = err;
+                        page_cache_release(page);
+                }
+        }
+        /* pass 2: clear the dirty bits and wait for the writes to finish */
+        while (1) {
+                ret = find_first_extent_bit(dirty_pages, 0, &start, &end,
+                                            EXTENT_DIRTY);
+                if (ret)
+                        break;
+                clear_extent_dirty(dirty_pages, start, end, GFP_NOFS);
+                while (start <= end) {
+                        index = start >> PAGE_CACHE_SHIFT;
+                        start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
+                        page = find_get_page(btree_inode->i_mapping, index);
+                        if (!page)
+                                continue;
+                        /* still (or again) dirty: write it once more */
+                        if (PageDirty(page)) {
+                                btree_lock_page_hook(page);
+                                wait_on_page_writeback(page);
+                                err = write_one_page(page, 0);
+                                if (err)
+                                        werr = err;
+                        }
+                        wait_on_page_writeback(page);
+                        page_cache_release(page);
+                        cond_resched();
+                }
+        }
+        if (err)
+                werr = err;
+        return werr;
+}
+/*
+ * Flush the btree pages dirtied by a transaction and wait for them.
+ *
+ * With a usable handle only the extents marked in the transaction's
+ * dirty_pages tree are written.  Without one (NULL @trans or no attached
+ * transaction) the whole btree inode mapping is written and waited on.
+ */
+int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
+                                     struct btrfs_root *root)
+{
+        if (trans && trans->transaction)
+                return btrfs_write_and_wait_marked_extents(root,
+                                           &trans->transaction->dirty_pages);
+        return filemap_write_and_wait(root->fs_info->btree_inode->i_mapping);
+}
+/*
+ * Bring the root item of a COW-only root up to date in the tree of tree
+ * roots.
+ *
+ * Updating the root item can itself allocate blocks, and for the extent
+ * allocation tree that may move the very root being recorded.  The update
+ * is therefore repeated until the bytenr stored in the root item matches
+ * the root's current node.
+ *
+ * Always returns 0; a failed btrfs_update_root() is a BUG.
+ */
+static int update_cowonly_root(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *root)
+{
+        struct btrfs_root *tree_root = root->fs_info->tree_root;
+        struct btrfs_root_item *item = &root->root_item;
+        int ret;
+        btrfs_extent_post_op(trans, root);
+        btrfs_write_dirty_block_groups(trans, root);
+        btrfs_extent_post_op(trans, root);
+        /* loop until the recorded location agrees with the live root node */
+        while (btrfs_root_bytenr(item) != root->node->start) {
+                btrfs_set_root_bytenr(item, root->node->start);
+                btrfs_set_root_level(item, btrfs_header_level(root->node));
+                btrfs_set_root_generation(item, trans->transid);
+                btrfs_extent_post_op(trans, root);
+                ret = btrfs_update_root(trans, tree_root, &root->root_key,
+                                        item);
+                BUG_ON(ret);
+                btrfs_write_dirty_block_groups(trans, root);
+                btrfs_extent_post_op(trans, root);
+        }
+        return 0;
+}
+/*
+ * Update every COW-only tree root on disk.  The tree root's own node is
+ * COWed first, then each root queued on fs_info->dirty_cowonly_roots is
+ * taken off the list and has its root item updated.  Always returns 0.
+ */
+int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
+                            struct btrfs_root *root)
+{
+        struct btrfs_fs_info *fs_info = root->fs_info;
+        struct btrfs_root *tree_root = fs_info->tree_root;
+        struct list_head *pending = &fs_info->dirty_cowonly_roots;
+        struct extent_buffer *eb;
+        btrfs_extent_post_op(trans, tree_root);
+        eb = btrfs_lock_root_node(tree_root);
+        btrfs_cow_block(trans, tree_root, eb, NULL, 0, &eb, 0);
+        btrfs_tree_unlock(eb);
+        free_extent_buffer(eb);
+        btrfs_extent_post_op(trans, tree_root);
+        while (!list_empty(pending)) {
+                struct list_head *entry = pending->next;
+                /* unlink before updating */
+                list_del_init(entry);
+                update_cowonly_root(trans,
+                                    list_entry(entry, struct btrfs_root,
+                                               dirty_list));
+        }
+        return 0;
+}
+/*
+ * Dead roots are old snapshots waiting to be deleted.  Wrap @root in a
+ * freshly allocated dirty root (remembering @latest as its live
+ * counterpart) and queue it on the filesystem's dead_roots list.
+ *
+ * Returns 0 on success or -ENOMEM.
+ */
+int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest)
+{
+        struct btrfs_dirty_root *entry = kmalloc(sizeof(*entry), GFP_NOFS);
+        if (!entry)
+                return -ENOMEM;
+        entry->latest_root = latest;
+        entry->root = root;
+        /* the dead_roots list is modified under trans_mutex */
+        mutex_lock(&root->fs_info->trans_mutex);
+        list_add(&entry->list, &latest->fs_info->dead_roots);
+        mutex_unlock(&root->fs_info->trans_mutex);
+        return 0;
+}
+/*
+ * at transaction commit time we need to schedule the old roots for
+ * deletion via btrfs_drop_snapshot.  This runs through all the
+ * reference counted roots that were modified in the current
+ * transaction and puts them into the drop list
+ *
+ * @radix is searched for roots carrying BTRFS_ROOT_TRANS_TAG; the tag is
+ * cleared on each root as it is processed.  Dirty roots whose old copy
+ * must be dropped are added to @list.  Returns the last value of err.
+ */
+static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
+                                    struct radix_tree_root *radix,
+                                    struct list_head *list)
+{
+        struct btrfs_dirty_root *dirty;
+        struct btrfs_root *gang[8];
+        struct btrfs_root *root;
+        int i;
+        int ret;
+        int err = 0;
+        u32 refs;
+        while (1) {
+                /* always search from 0: processed roots lose their tag */
+                ret = radix_tree_gang_lookup_tag(radix, (void **)gang, 0,
+                                                 ARRAY_SIZE(gang),
+                                                 BTRFS_ROOT_TRANS_TAG);
+                if (ret == 0)
+                        break;
+                for (i = 0; i < ret; i++) {
+                        root = gang[i];
+                        radix_tree_tag_clear(radix,
+                                     (unsigned long)root->root_key.objectid,
+                                     BTRFS_ROOT_TRANS_TAG);
+                        BUG_ON(!root->ref_tree);
+                        dirty = root->dirty_root;
+                        btrfs_free_log(trans, root);
+                        btrfs_free_reloc_root(trans, root);
+                        /*
+                         * The root node is still the one recorded when the
+                         * root joined the transaction: there is no old root
+                         * to drop, so discard the dirty copy.
+                         */
+                        if (root->commit_root == root->node) {
+                                WARN_ON(root->node->start !=
+                                        btrfs_root_bytenr(&root->root_item));
+                                free_extent_buffer(root->commit_root);
+                                root->commit_root = NULL;
+                                root->dirty_root = NULL;
+                                spin_lock(&root->list_lock);
+                                list_del_init(&dirty->root->dead_list);
+                                spin_unlock(&root->list_lock);
+                                kfree(dirty->root);
+                                kfree(dirty);
+                                /* make sure to update the root on disk
+                                 * so we get any updates to the block used
+                                 * counts
+                                 */
+                                err = btrfs_update_root(trans,
+                                                root->fs_info->tree_root,
+                                                &root->root_key,
+                                                &root->root_item);
+                                continue;
+                        }
+                        /*
+                         * The root changed: insert a new root item keyed by
+                         * the current generation for the live root.
+                         */
+                        memset(&root->root_item.drop_progress, 0,
+                               sizeof(struct btrfs_disk_key));
+                        root->root_item.drop_level = 0;
+                        root->commit_root = NULL;
+                        root->dirty_root = NULL;
+                        root->root_key.offset = root->fs_info->generation;
+                        btrfs_set_root_bytenr(&root->root_item,
+                                              root->node->start);
+                        btrfs_set_root_level(&root->root_item,
+                                             btrfs_header_level(root->node));
+                        btrfs_set_root_generation(&root->root_item,
+                                                  root->root_key.offset);
+                        err = btrfs_insert_root(trans, root->fs_info->tree_root,
+                                                &root->root_key,
+                                                &root->root_item);
+                        /*
+                         * NOTE(review): this break only leaves the for loop;
+                         * the enclosing while (1) then runs another gang
+                         * lookup and keeps processing the remaining tagged
+                         * roots.
+                         */
+                        if (err)
+                                break;
+                        /* drop one reference from the old copy of the root */
+                        refs = btrfs_root_refs(&dirty->root->root_item);
+                        btrfs_set_root_refs(&dirty->root->root_item, refs - 1);
+                        err = btrfs_update_root(trans, root->fs_info->tree_root,
+                                                &dirty->root->root_key,
+                                                &dirty->root->root_item);
+                        BUG_ON(err);
+                        if (refs == 1) {
+                                /* that was the last ref: queue for dropping */
+                                list_add(&dirty->list, list);
+                        } else {
+                                WARN_ON(1);
+                                free_extent_buffer(dirty->root->node);
+                                kfree(dirty->root);
+                                kfree(dirty);
+                        }
+                }
+        }
+        return err;
+}
+/*
+ * defrag a given btree.  If cacheonly == 1, this won't read from the disk,
+ * otherwise every leaf in the btree is read and defragged.
+ *
+ * Does nothing if a defrag of this root is already running.  The work is
+ * split across transactions: each round ends its transaction, balances
+ * dirty btree pages and starts a new one, for as long as
+ * btrfs_defrag_leaves() returns -EAGAIN and the filesystem is not closing.
+ * Always returns 0.
+ */
+int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
+{
+        struct btrfs_fs_info *info = root->fs_info;
+        int ret;
+        struct btrfs_trans_handle *trans;
+        unsigned long nr;
+        smp_mb();
+        if (root->defrag_running)
+                return 0;
+        trans = btrfs_start_transaction(root, 1);
+        while (1) {
+                root->defrag_running = 1;
+                ret = btrfs_defrag_leaves(trans, root, cacheonly);
+                /* read blocks_used before the handle is freed */
+                nr = trans->blocks_used;
+                btrfs_end_transaction(trans, root);
+                btrfs_btree_balance_dirty(info->tree_root, nr);
+                cond_resched();
+                /*
+                 * A new handle is started even on the final round; it is
+                 * ended right after the loop.
+                 */
+                trans = btrfs_start_transaction(root, 1);
+                if (root->fs_info->closing || ret != -EAGAIN)
+                        break;
+        }
+        root->defrag_running = 0;
+        smp_mb();
+        btrfs_end_transaction(trans, root);
+        return 0;
+}
+/*
+ * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
+ * all of them
+ *
+ * Entries are taken from the tail of @list.  Each snapshot is dropped in
+ * as many transactions as btrfs_drop_snapshot() asks for (-EAGAIN), the
+ * bytes freed are subtracted from the latest root's used count, the old
+ * root item is deleted and the dirty root is freed.  fs_info->throttles is
+ * raised while a drop is in progress so other writers are rate limited.
+ */
+static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
+                                     struct list_head *list)
+{
+        struct btrfs_dirty_root *dirty;
+        struct btrfs_trans_handle *trans;
+        unsigned long nr;
+        u64 num_bytes;
+        u64 bytes_used;
+        u64 max_useless;
+        int ret = 0;
+        int err;
+        while (!list_empty(list)) {
+                struct btrfs_root *root;
+                dirty = list_entry(list->prev, struct btrfs_dirty_root, list);
+                list_del_init(&dirty->list);
+                /* remember the used bytes so the amount freed can be derived */
+                num_bytes = btrfs_root_used(&dirty->root->root_item);
+                root = dirty->latest_root;
+                atomic_inc(&root->fs_info->throttles);
+                while (1) {
+                        trans = btrfs_start_transaction(tree_root, 1);
+                        mutex_lock(&root->fs_info->drop_mutex);
+                        ret = btrfs_drop_snapshot(trans, dirty->root);
+                        /*
+                         * Done (or failed): leave the loop with drop_mutex
+                         * held and the transaction still open.
+                         */
+                        if (ret != -EAGAIN)
+                                break;
+                        mutex_unlock(&root->fs_info->drop_mutex);
+                        /* save the drop progress before ending this round */
+                        err = btrfs_update_root(trans,
+                                        tree_root,
+                                        &dirty->root->root_key,
+                                        &dirty->root->root_item);
+                        /*
+                         * NOTE(review): ret is overwritten by the
+                         * btrfs_end_transaction() call below, so an error
+                         * from btrfs_update_root() is lost.
+                         */
+                        if (err)
+                                ret = err;
+                        nr = trans->blocks_used;
+                        ret = btrfs_end_transaction(trans, tree_root);
+                        BUG_ON(ret);
+                        btrfs_btree_balance_dirty(tree_root, nr);
+                        cond_resched();
+                }
+                BUG_ON(ret);
+                atomic_dec(&root->fs_info->throttles);
+                wake_up(&root->fs_info->transaction_throttle);
+                /* num_bytes becomes the number of bytes the drop released */
+                num_bytes -= btrfs_root_used(&dirty->root->root_item);
+                bytes_used = btrfs_root_used(&root->root_item);
+                if (num_bytes) {
+                        btrfs_record_root_in_trans(root);
+                        btrfs_set_root_used(&root->root_item,
+                                            bytes_used - num_bytes);
+                }
+                ret = btrfs_del_root(trans, tree_root, &dirty->root->root_key);
+                if (ret) {
+                        BUG();
+                        break;
+                }
+                mutex_unlock(&root->fs_info->drop_mutex);
+                spin_lock(&root->list_lock);
+                list_del_init(&dirty->root->dead_list);
+                /*
+                 * max_useless is one less than the key offset of the oldest
+                 * remaining dead root, or of the live root if none remain.
+                 */
+                if (!list_empty(&root->dead_list)) {
+                        struct btrfs_root *oldest;
+                        oldest = list_entry(root->dead_list.prev,
+                                            struct btrfs_root, dead_list);
+                        max_useless = oldest->root_key.offset - 1;
+                } else {
+                        max_useless = root->root_key.offset - 1;
+                }
+                spin_unlock(&root->list_lock);
+                nr = trans->blocks_used;
+                ret = btrfs_end_transaction(trans, tree_root);
+                BUG_ON(ret);
+                ret = btrfs_remove_leaf_refs(root, max_useless, 0);
+                BUG_ON(ret);
+                free_extent_buffer(dirty->root->node);
+                kfree(dirty->root);
+                kfree(dirty);
+                btrfs_btree_balance_dirty(tree_root, nr);
+                cond_resched();
+        }
+        return ret;
+}
+/*
+ * new snapshots need to be created at a very specific time in the
+ * transaction commit.  This does the actual creation
+ *
+ * A free objectid is picked for the snapshot, the source root's node is
+ * COWed and copied, and a root item describing the copy is inserted into
+ * the tree root.  On success pending->root_key holds the new root's key
+ * with offset (u64)-1.  Returns 0 or a negative errno.
+ */
+static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
+                                   struct btrfs_fs_info *fs_info,
+                                   struct btrfs_pending_snapshot *pending)
+{
+        struct btrfs_key key;
+        struct btrfs_root_item *new_root_item;
+        struct btrfs_root *tree_root = fs_info->tree_root;
+        struct btrfs_root *root = pending->root;
+        struct extent_buffer *tmp;
+        struct extent_buffer *old;
+        int ret;
+        u64 objectid;
+        new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
+        if (!new_root_item) {
+                ret = -ENOMEM;
+                goto fail;
+        }
+        ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid);
+        if (ret)
+                goto fail;
+        btrfs_record_root_in_trans(root);
+        btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
+        /* the snapshot's root item starts out as a copy of the source's */
+        memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
+        key.objectid = objectid;
+        key.offset = trans->transid;
+        btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+        old = btrfs_lock_root_node(root);
+        btrfs_cow_block(trans, root, old, NULL, 0, &old, 0);
+        /*
+         * NOTE(review): tmp is unlocked below, so btrfs_copy_root()
+         * presumably returns it locked -- confirm.
+         */
+        btrfs_copy_root(trans, root, old, &tmp, objectid);
+        btrfs_tree_unlock(old);
+        free_extent_buffer(old);
+        btrfs_set_root_bytenr(new_root_item, tmp->start);
+        btrfs_set_root_level(new_root_item, btrfs_header_level(tmp));
+        btrfs_set_root_generation(new_root_item, trans->transid);
+        ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
+                                new_root_item);
+        btrfs_tree_unlock(tmp);
+        free_extent_buffer(tmp);
+        if (ret)
+                goto fail;
+        key.offset = (u64)-1;
+        memcpy(&pending->root_key, &key, sizeof(key));
+fail:
+        /* reached on success as well; kfree() accepts NULL */
+        kfree(new_root_item);
+        return ret;
+}
+/*
+ * Link a snapshot that was created during commit into its parent
+ * directory.
+ *
+ * Joins the running transaction on the parent directory's root, inserts a
+ * directory item for pending->name that points at pending->root_key, grows
+ * the parent's i_size, records the back and forward root references in
+ * fs_info->tree_root and finally instantiates pending->dentry.
+ *
+ * Returns 0 on success.  A failure to allocate the directory index or to
+ * insert the directory item aborts the remaining steps and is returned.
+ * The result of the forward-ref insert is returned too, but the dentry is
+ * instantiated regardless of it.
+ */
+static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
+                                   struct btrfs_pending_snapshot *pending)
+{
+        int ret;
+        int namelen;
+        u64 index = 0;
+        struct btrfs_trans_handle *trans;
+        struct inode *parent_inode;
+        struct inode *inode;
+        struct btrfs_root *parent_root;
+        parent_inode = pending->dentry->d_parent->d_inode;
+        parent_root = BTRFS_I(parent_inode)->root;
+        trans = btrfs_join_transaction(parent_root, 1);
+        /*
+         * insert the directory item
+         */
+        namelen = strlen(pending->name);
+        ret = btrfs_set_inode_index(parent_inode, &index);
+        /* don't insert a dir item with an index we failed to allocate */
+        if (ret)
+                goto fail;
+        ret = btrfs_insert_dir_item(trans, parent_root,
+                            pending->name, namelen,
+                            parent_inode->i_ino,
+                            &pending->root_key, BTRFS_FT_DIR, index);
+        if (ret)
+                goto fail;
+        btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2);
+        ret = btrfs_update_inode(trans, parent_root, parent_inode);
+        BUG_ON(ret);
+        /* add the backref first */
+        ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
+                                 pending->root_key.objectid,
+                                 BTRFS_ROOT_BACKREF_KEY,
+                                 parent_root->root_key.objectid,
+                                 parent_inode->i_ino, index, pending->name,
+                                 namelen);
+        BUG_ON(ret);
+        /* now add the forward ref */
+        ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
+                                 parent_root->root_key.objectid,
+                                 BTRFS_ROOT_REF_KEY,
+                                 pending->root_key.objectid,
+                                 parent_inode->i_ino, index, pending->name,
+                                 namelen);
+        inode = btrfs_lookup_dentry(parent_inode, pending->dentry);
+        d_instantiate(pending->dentry, inode);
+fail:
+        btrfs_end_transaction(trans, fs_info->fs_root);
+        return ret;
+}
+/*
+ * create all the snapshots we've scheduled for creation
+ *
+ * Every entry on the transaction's pending_snapshots list is handed to
+ * create_pending_snapshot().  Entries stay on the list, and a failure of
+ * any single snapshot is fatal (BUG_ON).  Always returns 0.
+ */
+static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
+                                             struct btrfs_fs_info *fs_info)
+{
+        struct btrfs_pending_snapshot *snap;
+        int err;
+        list_for_each_entry(snap, &trans->transaction->pending_snapshots,
+                            list) {
+                err = create_pending_snapshot(trans, fs_info, snap);
+                BUG_ON(err);
+        }
+        return 0;
+}
+/*
+ * Drain the transaction's pending_snapshots list: finish each snapshot,
+ * unlink it from the list and free both its name and the entry itself.
+ * A failure to finish any snapshot is fatal (BUG_ON).  Always returns 0.
+ */
+static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans,
+                                             struct btrfs_fs_info *fs_info)
+{
+        struct list_head *snapshots = &trans->transaction->pending_snapshots;
+        struct btrfs_pending_snapshot *snap;
+        int err;
+        for (;;) {
+                if (list_empty(snapshots))
+                        break;
+                /* always work on whatever is at the front of the list */
+                snap = list_entry(snapshots->next,
+                                  struct btrfs_pending_snapshot, list);
+                err = finish_pending_snapshot(fs_info, snap);
+                BUG_ON(err);
+                list_del(&snap->list);
+                kfree(snap->name);
+                kfree(snap);
+        }
+        return 0;
+}
+/*
+ * Commit the transaction that 'trans' belongs to.
+ *
+ * If another task is already committing this transaction (in_commit is
+ * set), the handle is ended and we simply wait for that commit to finish.
+ * Otherwise this task becomes the committer: it waits for the previous
+ * transaction and for all other writers, creates pending snapshots,
+ * brings the tree roots up to date, writes the dirty blocks and the super
+ * block, and finally finishes the pending snapshots and frees the handle.
+ *
+ * Returns 0 on success, or -ENOMEM if the scratch copy of the pinned
+ * extent tree cannot be allocated; on that path the transaction has not
+ * been marked in_commit and the handle is left untouched.  Failures of the
+ * later steps are treated as fatal (BUG_ON).
+ */
+int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
+                             struct btrfs_root *root)
+{
+        unsigned long joined = 0;
+        unsigned long timeout = 1;
+        struct btrfs_transaction *cur_trans;
+        struct btrfs_transaction *prev_trans = NULL;
+        struct btrfs_root *chunk_root = root->fs_info->chunk_root;
+        struct list_head dirty_fs_roots;
+        struct extent_io_tree *pinned_copy;
+        DEFINE_WAIT(wait);
+        int ret;
+        INIT_LIST_HEAD(&dirty_fs_roots);
+        mutex_lock(&root->fs_info->trans_mutex);
+        if (trans->transaction->in_commit) {
+                /*
+                 * somebody else is committing: pin the transaction so it
+                 * survives us ending our handle, then wait for them
+                 */
+                cur_trans = trans->transaction;
+                trans->transaction->use_count++;
+                mutex_unlock(&root->fs_info->trans_mutex);
+                btrfs_end_transaction(trans, root);
+                ret = wait_for_commit(root, cur_trans);
+                BUG_ON(ret);
+                mutex_lock(&root->fs_info->trans_mutex);
+                put_transaction(cur_trans);
+                mutex_unlock(&root->fs_info->trans_mutex);
+                return 0;
+        }
+        pinned_copy = kmalloc(sizeof(*pinned_copy), GFP_NOFS);
+        if (!pinned_copy) {
+                /* trans_mutex was taken above, don't return with it held */
+                mutex_unlock(&root->fs_info->trans_mutex);
+                return -ENOMEM;
+        }
+        extent_io_tree_init(pinned_copy,
+                             root->fs_info->btree_inode->i_mapping, GFP_NOFS);
+        trans->transaction->in_commit = 1;
+        trans->transaction->blocked = 1;
+        cur_trans = trans->transaction;
+        /* if there is an older transaction on the list, let it finish first */
+        if (cur_trans->list.prev != &root->fs_info->trans_list) {
+                prev_trans = list_entry(cur_trans->list.prev,
+                                        struct btrfs_transaction, list);
+                if (!prev_trans->commit_done) {
+                        prev_trans->use_count++;
+                        mutex_unlock(&root->fs_info->trans_mutex);
+                        wait_for_commit(root, prev_trans);
+                        mutex_lock(&root->fs_info->trans_mutex);
+                        put_transaction(prev_trans);
+                }
+        }
+        /*
+         * wait until we are the only writer left and nobody joined the
+         * transaction while we were sleeping
+         */
+        do {
+                int snap_pending = 0;
+                joined = cur_trans->num_joined;
+                if (!list_empty(&trans->transaction->pending_snapshots))
+                        snap_pending = 1;
+                WARN_ON(cur_trans != trans->transaction);
+                prepare_to_wait(&cur_trans->writer_wait, &wait,
+                                TASK_UNINTERRUPTIBLE);
+                if (cur_trans->num_writers > 1)
+                        timeout = MAX_SCHEDULE_TIMEOUT;
+                else
+                        timeout = 1;
+                mutex_unlock(&root->fs_info->trans_mutex);
+                if (snap_pending) {
+                        ret = btrfs_wait_ordered_extents(root, 1);
+                        BUG_ON(ret);
+                }
+                schedule_timeout(timeout);
+                mutex_lock(&root->fs_info->trans_mutex);
+                finish_wait(&cur_trans->writer_wait, &wait);
+        } while (cur_trans->num_writers > 1 ||
+                 (cur_trans->num_joined != joined));
+        ret = create_pending_snapshots(trans, root->fs_info);
+        BUG_ON(ret);
+        WARN_ON(cur_trans != trans->transaction);
+        /* btrfs_commit_tree_roots is responsible for getting the
+         * various roots consistent with each other.  Every pointer
+         * in the tree of tree roots has to point to the most up to date
+         * root for every subvolume and other tree.  So, we have to keep
+         * the tree logging code from jumping in and changing any
+         * of the trees.
+         *
+         * At this point in the commit, there can't be any tree-log
+         * writers, but a little lower down we drop the trans mutex
+         * and let new people in.  By holding the tree_log_mutex
+         * from now until after the super is written, we avoid races
+         * with the tree-log code.
+         */
+        mutex_lock(&root->fs_info->tree_log_mutex);
+        /*
+         * keep tree reloc code from adding new reloc trees
+         */
+        mutex_lock(&root->fs_info->tree_reloc_mutex);
+        ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix,
+                              &dirty_fs_roots);
+        BUG_ON(ret);
+        /* add_dirty_roots gets rid of all the tree log roots, it is now
+         * safe to free the root of tree log roots
+         */
+        btrfs_free_log_root_tree(trans, root->fs_info);
+        ret = btrfs_commit_tree_roots(trans, root);
+        BUG_ON(ret);
+        /* detach the transaction so that a new one can be started */
+        cur_trans = root->fs_info->running_transaction;
+        spin_lock(&root->fs_info->new_trans_lock);
+        root->fs_info->running_transaction = NULL;
+        spin_unlock(&root->fs_info->new_trans_lock);
+        /* record the new tree root and chunk root in the super block copy */
+        btrfs_set_super_generation(&root->fs_info->super_copy,
+                                   cur_trans->transid);
+        btrfs_set_super_root(&root->fs_info->super_copy,
+                             root->fs_info->tree_root->node->start);
+        btrfs_set_super_root_level(&root->fs_info->super_copy,
+                           btrfs_header_level(root->fs_info->tree_root->node));
+        btrfs_set_super_chunk_root(&root->fs_info->super_copy,
+                                   chunk_root->node->start);
+        btrfs_set_super_chunk_root_level(&root->fs_info->super_copy,
+                                         btrfs_header_level(chunk_root->node));
+        btrfs_set_super_chunk_root_generation(&root->fs_info->super_copy,
+                                btrfs_header_generation(chunk_root->node));
+        if (!root->fs_info->log_root_recovering) {
+                btrfs_set_super_log_root(&root->fs_info->super_copy, 0);
+                btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0);
+        }
+        memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy,
+               sizeof(root->fs_info->super_copy));
+        btrfs_copy_pinned(root, pinned_copy);
+        /* unblock and wake everyone waiting on this transaction */
+        trans->transaction->blocked = 0;
+        wake_up(&root->fs_info->transaction_throttle);
+        wake_up(&root->fs_info->transaction_wait);
+        mutex_unlock(&root->fs_info->trans_mutex);
+        ret = btrfs_write_and_wait_transaction(trans, root);
+        BUG_ON(ret);
+        write_ctree_super(trans, root, 0);
+        /*
+         * the super is written, we can safely allow the tree-loggers
+         * to go about their business
+         */
+        mutex_unlock(&root->fs_info->tree_log_mutex);
+        btrfs_finish_extent_commit(trans, root, pinned_copy);
+        kfree(pinned_copy);
+        btrfs_drop_dead_reloc_roots(root);
+        mutex_unlock(&root->fs_info->tree_reloc_mutex);
+        /* do the directory inserts of any pending snapshot creations */
+        finish_pending_snapshots(trans, root->fs_info);
+        mutex_lock(&root->fs_info->trans_mutex);
+        cur_trans->commit_done = 1;
+        root->fs_info->last_trans_committed = cur_trans->transid;
+        wake_up(&cur_trans->commit_wait);
+        /* two references are dropped here, as in the original code */
+        put_transaction(cur_trans);
+        put_transaction(cur_trans);
+        list_splice_init(&dirty_fs_roots, &root->fs_info->dead_roots);
+        /* when closing, take the dead roots back so they are dropped below */
+        if (root->fs_info->closing)
+                list_splice_init(&root->fs_info->dead_roots, &dirty_fs_roots);
+        mutex_unlock(&root->fs_info->trans_mutex);
+        kmem_cache_free(btrfs_trans_handle_cachep, trans);
+        if (root->fs_info->closing)
+                drop_dirty_roots(root->fs_info->tree_root, &dirty_fs_roots);
+        return ret;
+}
+/*
+ * interface function to delete all the snapshots we have scheduled for deletion
+ *
+ * Roots are moved off fs_info->dead_roots under the trans_mutex and then
+ * dropped without the mutex held; this repeats until a pass finds nothing
+ * left to drop.  Always returns 0.
+ */
+int btrfs_clean_old_snapshots(struct btrfs_root *root)
+{
+        struct list_head batch;
+        INIT_LIST_HEAD(&batch);
+        for (;;) {
+                mutex_lock(&root->fs_info->trans_mutex);
+                list_splice_init(&root->fs_info->dead_roots, &batch);
+                mutex_unlock(&root->fs_info->trans_mutex);
+                if (list_empty(&batch))
+                        break;
+                drop_dirty_roots(root, &batch);
+        }
+        return 0;
+}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
new file mode 100644
index 000000000000..ea292117f882
--- /dev/null
+++ b/fs/btrfs/transaction.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#ifndef __BTRFS_TRANSACTION__
+#define __BTRFS_TRANSACTION__
+#include "btrfs_inode.h"
+/*
+ * in-memory state of one transaction
+ */
+struct btrfs_transaction {
+        /* id of this transaction; becomes the super block generation at commit */
+        u64 transid;
+        /* the committer sleeps on writer_wait while this is greater than 1 */
+        unsigned long num_writers;
+        /* sampled by the committer, which waits again if it changed meanwhile */
+        unsigned long num_joined;
+        /* set to 1 once a task has started committing this transaction */
+        int in_commit;
+        /* reference count; use_count++ is paired with put_transaction() */
+        int use_count;
+        /* set to 1 once the commit has finished and the super is written */
+        int commit_done;
+        /* set to 1 early in the commit and cleared before waiters are woken */
+        int blocked;
+        /* link in fs_info->trans_list */
+        struct list_head list;
+        /* NOTE(review): presumably the pages dirtied by this transaction -- confirm */
+        struct extent_io_tree dirty_pages;
+        /* NOTE(review): units and clock not visible here -- confirm */
+        unsigned long start_time;
+        /* the committer waits here for the other writers to finish */
+        wait_queue_head_t writer_wait;
+        /* woken once commit_done has been set */
+        wait_queue_head_t commit_wait;
+        /* list of struct btrfs_pending_snapshot created during the commit */
+        struct list_head pending_snapshots;
+};
+/*
+ * per-caller handle on a transaction; the commit code frees it back to
+ * btrfs_trans_handle_cachep
+ */
+struct btrfs_trans_handle {
+        /* transaction id, used e.g. as the generation of newly written blocks */
+        u64 transid;
+        /* NOTE(review): block accounting; units not visible here -- confirm */
+        unsigned long blocks_reserved;
+        unsigned long blocks_used;
+        /* the transaction this handle belongs to */
+        struct btrfs_transaction *transaction;
+        /* block group hint, copied to and from the inode by the helpers below */
+        u64 block_group;
+        /* NOTE(review): presumably a range the allocator must skip -- confirm */
+        u64 alloc_exclude_start;
+        u64 alloc_exclude_nr;
+};
+/*
+ * a snapshot that has been requested and is created during the next commit
+ */
+struct btrfs_pending_snapshot {
+        /* dentry of the new snapshot; instantiated once the snapshot is finished */
+        struct dentry *dentry;
+        /* NOTE(review): presumably the root being snapshotted -- confirm */
+        struct btrfs_root *root;
+        /* name of the directory entry; kfree()d when the snapshot is finished */
+        char *name;
+        /* key of the new root item, filled in when the snapshot is created */
+        struct btrfs_key root_key;
+        /* link in btrfs_transaction->pending_snapshots */
+        struct list_head list;
+};
+/*
+ * list entry describing a root that is queued on a dirty/dead roots list
+ * NOTE(review): the exact meaning of root vs latest_root is not visible
+ * here -- confirm against add_dirty_roots()/btrfs_add_dead_root()
+ */
+struct btrfs_dirty_root {
+        struct list_head list;
+        struct btrfs_root *root;
+        struct btrfs_root *latest_root;
+};
+/*
+ * copy the inode's block group into the transaction handle
+ */
+static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans,
+                                               struct inode *inode)
+{
+        trans->block_group = BTRFS_I(inode)->block_group;
+}
+/*
+ * copy the transaction handle's block group back into the inode
+ */
+static inline void btrfs_update_inode_block_group(
+                                          struct btrfs_trans_handle *trans,
+                                          struct inode *inode)
+{
+        BTRFS_I(inode)->block_group = trans->block_group;
+}
+/*
+ * record the id of the handle's transaction as the inode's last_trans
+ */
+static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
+                                              struct inode *inode)
+{
+        BTRFS_I(inode)->last_trans = trans->transaction->transid;
+}
+int btrfs_end_transaction(struct btrfs_trans_handle *trans,
+                          struct btrfs_root *root);
+struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
+                                                   int num_blocks);
+struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
+                                                   int num_blocks);
+struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
+                                                   int num_blocks);
+int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
+                                     struct btrfs_root *root);
+int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
+                            struct btrfs_root *root);
+int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest);
+int btrfs_defrag_root(struct btrfs_root *root, int cacheonly);
+int btrfs_clean_old_snapshots(struct btrfs_root *root);
+int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
+                             struct btrfs_root *root);
+int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
+                                   struct btrfs_root *root);
+void btrfs_throttle(struct btrfs_root *root);
+int btrfs_record_root_in_trans(struct btrfs_root *root);
+int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
+                                        struct extent_io_tree *dirty_pages);
+#endif
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
new file mode 100644
index 000000000000..3e8358c36165
--- /dev/null
+++ b/fs/btrfs/tree-defrag.c
@@ -0,0 +1,147 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#include <linux/sched.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "print-tree.h"
+#include "transaction.h"
+#include "locking.h"
+/* defrag all the leaves in a given btree.  If cache_only == 1, don't read
+ * things from disk, otherwise read all the leaves and try to get key order to
+ * better reflect disk order
+ *
+ * Note that as written a non-zero cache_only makes the function bail out
+ * before doing any work, so the later cache_only tests never see it set.
+ *
+ * One call processes one level-1 node and stores the key to resume from in
+ * root->defrag_progress.  Returns -EAGAIN while there is more to do, 0 when
+ * the pass is complete or nothing needed doing, -ENOMEM if no path could be
+ * allocated, or a negative error from the tree search.  Whenever the result
+ * is not -EAGAIN (except for the -ENOMEM return) the saved progress is
+ * reset and defrag_trans_start is set to the current transaction id.
+ */
+int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
+                        struct btrfs_root *root, int cache_only)
+{
+        struct btrfs_path *path = NULL;
+        struct btrfs_key key;
+        int ret = 0;
+        int wret;
+        int level;
+        int orig_level;
+        /* never set below, so the extent-root special cases stay disabled */
+        int is_extent = 0;
+        int next_key_ret = 0;
+        u64 last_ret = 0;
+        u64 min_trans = 0;
+        if (cache_only)
+                goto out;
+        if (root->fs_info->extent_root == root) {
+                /*
+                 * there's recursion here right now in the tree locking,
+                 * we can't defrag the extent root without deadlock
+                 */
+                goto out;
+        }
+        if (root->ref_cows == 0 && !is_extent)
+                goto out;
+        if (btrfs_test_opt(root, SSD))
+                goto out;
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+        level = btrfs_header_level(root->node);
+        orig_level = level;
+        /* a tree that is a single leaf has nothing to reorder */
+        if (level == 0)
+                goto out;
+        if (root->defrag_progress.objectid == 0) {
+                /*
+                 * no saved progress: start from the smallest key and record
+                 * the last key of the root node as the end of this pass
+                 */
+                struct extent_buffer *root_node;
+                u32 nritems;
+                root_node = btrfs_lock_root_node(root);
+                nritems = btrfs_header_nritems(root_node);
+                root->defrag_max.objectid = 0;
+                /* from above we know this is not a leaf */
+                btrfs_node_key_to_cpu(root_node, &root->defrag_max,
+                                      nritems - 1);
+                btrfs_tree_unlock(root_node);
+                free_extent_buffer(root_node);
+                memset(&key, 0, sizeof(key));
+        } else {
+                /* resume from where the previous call stopped */
+                memcpy(&key, &root->defrag_progress, sizeof(key));
+        }
+        path->keep_locks = 1;
+        if (cache_only)
+                min_trans = root->defrag_trans_start;
+        ret = btrfs_search_forward(root, &key, NULL, path,
+                                   cache_only, min_trans);
+        if (ret < 0)
+                goto out;
+        if (ret > 0) {
+                /* the forward search found nothing more: the pass is done */
+                ret = 0;
+                goto out;
+        }
+        btrfs_release_path(root, path);
+        /* search again for the key found above, this time with cow set */
+        wret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+        if (wret < 0) {
+                ret = wret;
+                goto out;
+        }
+        if (!path->nodes[1]) {
+                ret = 0;
+                goto out;
+        }
+        /*
+         * point past the last slot of the level-1 node so that the next key
+         * lookup returns the first key beyond the node we are about to
+         * process
+         */
+        path->slots[1] = btrfs_header_nritems(path->nodes[1]);
+        next_key_ret = btrfs_find_next_key(root, path, &key, 1, cache_only,
+                                           min_trans);
+        ret = btrfs_realloc_node(trans, root,
+                                 path->nodes[1], 0,
+                                 cache_only, &last_ret,
+                                 &root->defrag_progress);
+        WARN_ON(ret && ret != -EAGAIN);
+        if (next_key_ret == 0) {
+                /* there is a following key: save it and ask to be called again */
+                memcpy(&root->defrag_progress, &key, sizeof(key));
+                ret = -EAGAIN;
+        }
+        btrfs_release_path(root, path);
+        if (is_extent)
+                btrfs_extent_post_op(trans, root);
+out:
+        if (path)
+                btrfs_free_path(path);
+        if (ret == -EAGAIN) {
+                /*
+                 * keep -EAGAIN only while some field of defrag_max is still
+                 * ahead of the saved progress (each field is compared on its
+                 * own); otherwise the pass is over
+                 */
+                if (root->defrag_max.objectid > root->defrag_progress.objectid)
+                        goto done;
+                if (root->defrag_max.type > root->defrag_progress.type)
+                        goto done;
+                if (root->defrag_max.offset > root->defrag_progress.offset)
+                        goto done;
+                ret = 0;
+        }
+done:
+        if (ret != -EAGAIN) {
+                /* finished or failed: forget the progress for the next pass */
+                memset(&root->defrag_progress, 0,
+                       sizeof(root->defrag_progress));
+                root->defrag_trans_start = trans->transid;
+        }
+        return ret;
+}
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
new file mode 100644
index 000000000000..d81cda2e077c
--- /dev/null
+++ b/fs/btrfs/tree-log.c
@@ -0,0 +1,2898 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#include <linux/sched.h>
+#include "ctree.h"
+#include "transaction.h"
+#include "disk-io.h"
+#include "locking.h"
+#include "print-tree.h"
+#include "compat.h"
+#include "tree-log.h"
+/* magic values for the inode_only field in btrfs_log_inode:
+ *
+ * LOG_INODE_ALL means to log everything
+ * LOG_INODE_EXISTS means to log just enough to recreate the inode
+ * during log replay
+ */
+#define LOG_INODE_ALL 0
+#define LOG_INODE_EXISTS 1
+/*
+ * stages for the tree walking.  The first
+ * stage (0) is to only pin down the blocks we find
+ * the second stage (1) is to make sure that all the inodes
+ * we find in the log are created in the subvolume.
+ *
+ * The last stage is to deal with directories and links and extents
+ * and all the other fun semantics
+ */
+#define LOG_WALK_PIN_ONLY 0
+#define LOG_WALK_REPLAY_INODES 1
+#define LOG_WALK_REPLAY_ALL 2
+static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
+                             struct btrfs_root *root, struct inode *inode,
+                             int inode_only);
+static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
+                             struct btrfs_root *root,
+                             struct btrfs_path *path, u64 objectid);
+/*
+ * tree logging is a special write ahead log used to make sure that
+ * fsyncs and O_SYNCs can happen without doing full tree commits.
+ *
+ * Full tree commits are expensive because they require commonly
+ * modified blocks to be recowed, creating many dirty pages in the
+ * extent tree and a 4x-6x higher write load than ext3.
+ *
+ * Instead of doing a tree commit on every fsync, we use the
+ * key ranges and transaction ids to find items for a given file or directory
+ * that have changed in this transaction.  Those items are copied into
+ * a special tree (one per subvolume root), that tree is written to disk
+ * and then the fsync is considered complete.
+ *
+ * After a crash, items are copied out of the log-tree back into the
+ * subvolume tree.  Any file data extents found are recorded in the extent
+ * allocation tree, and the log-tree freed.
+ *
+ * The log tree is read three times: once to pin down all the extents it is
+ * using in ram, once to create all the inodes logged in the tree,
+ * and once to do all the other items.
+ */
+/*
+ * btrfs_add_log_tree adds a new per-subvolume log tree into the
+ * tree of log tree roots.  This must be called with a tree log transaction
+ * running (see start_log_trans).
+ *
+ * An empty leaf is allocated to serve as the root block of the log tree, a
+ * root item describing it is inserted into fs_info->log_root_tree (keyed by
+ * the subvolume's objectid) and the resulting root is stored in
+ * root->log_root.  Returns 0 on success or the error from the block
+ * allocation or the root insertion.
+ */
+static int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
+                      struct btrfs_root *root)
+{
+        struct btrfs_key log_key;
+        struct btrfs_root_item item;
+        struct btrfs_inode_item *dir;
+        struct extent_buffer *eb;
+        struct btrfs_root *log;
+        u64 subvol_id = root->root_key.objectid;
+        int err;
+        eb = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
+                                    BTRFS_TREE_LOG_OBJECTID,
+                                    trans->transid, 0, 0, 0);
+        if (IS_ERR(eb))
+                return PTR_ERR(eb);
+        /* set up the header of the new, empty leaf */
+        btrfs_set_header_nritems(eb, 0);
+        btrfs_set_header_level(eb, 0);
+        btrfs_set_header_bytenr(eb, eb->start);
+        btrfs_set_header_generation(eb, trans->transid);
+        btrfs_set_header_owner(eb, BTRFS_TREE_LOG_OBJECTID);
+        write_extent_buffer(eb, root->fs_info->fsid,
+                            (unsigned long)btrfs_header_fsid(eb),
+                            BTRFS_FSID_SIZE);
+        btrfs_mark_buffer_dirty(eb);
+        /* the inode embedded in the root item */
+        dir = &item.inode;
+        memset(dir, 0, sizeof(*dir));
+        dir->generation = cpu_to_le64(1);
+        dir->size = cpu_to_le64(3);
+        dir->nlink = cpu_to_le32(1);
+        dir->nbytes = cpu_to_le64(root->leafsize);
+        dir->mode = cpu_to_le32(S_IFDIR | 0755);
+        /* the root item itself, pointing at the leaf allocated above */
+        btrfs_set_root_bytenr(&item, eb->start);
+        btrfs_set_root_generation(&item, trans->transid);
+        btrfs_set_root_level(&item, 0);
+        btrfs_set_root_refs(&item, 0);
+        btrfs_set_root_used(&item, 0);
+        memset(&item.drop_progress, 0, sizeof(item.drop_progress));
+        item.drop_level = 0;
+        btrfs_set_root_dirid(&item, 0);
+        /* the leaf is fully described by the root item now, let go of it */
+        btrfs_tree_unlock(eb);
+        free_extent_buffer(eb);
+        log_key.objectid = BTRFS_TREE_LOG_OBJECTID;
+        log_key.offset = subvol_id;
+        btrfs_set_key_type(&log_key, BTRFS_ROOT_ITEM_KEY);
+        err = btrfs_insert_root(trans, root->fs_info->log_root_tree, &log_key,
+                                &item);
+        if (err)
+                return err;
+        log = btrfs_read_fs_root_no_radix(root->fs_info->log_root_tree,
+                                          &log_key);
+        BUG_ON(!log);
+        WARN_ON(root->log_root);
+        root->log_root = log;
+        /*
+         * log trees do not get reference counted because they go away
+         * before a real commit is actually done.  They do store pointers
+         * to file data extents, and those reference counts still get
+         * updated (along with back refs to the log tree).
+         */
+        log->ref_cows = 0;
+        log->last_trans = trans->transid;
+        /*
+         * we need to make sure the root block for this new tree
+         * is marked as dirty in the dirty_log_pages tree.  This
+         * is how it gets flushed down to disk at tree log commit time.
+         *
+         * the tree logging mutex keeps others from coming in and changing
+         * the new root's node, so we can safely access it here
+         */
+        set_extent_dirty(&log->dirty_log_pages, log->node->start,
+                         log->node->start + log->node->len - 1,
+                         GFP_NOFS);
+        return 0;
+}
+/*
+ * start a sub transaction and setup the log tree
+ * this increments the log tree writer count to make the people
+ * syncing the tree wait for us to finish
+ *
+ * The tree of log roots and this root's own log tree are created on first
+ * use, all under the tree_log_mutex.  Failure to create either is fatal
+ * (BUG_ON).  Always returns 0.
+ */
+static int start_log_trans(struct btrfs_trans_handle *trans,
+                           struct btrfs_root *root)
+{
+        struct btrfs_fs_info *info = root->fs_info;
+        int err;
+        mutex_lock(&info->tree_log_mutex);
+        if (info->log_root_tree == NULL) {
+                err = btrfs_init_log_root_tree(trans, info);
+                BUG_ON(err);
+        }
+        if (root->log_root == NULL) {
+                err = btrfs_add_log_tree(trans, root);
+                BUG_ON(err);
+        }
+        atomic_inc(&info->tree_log_writers);
+        info->tree_log_batch++;
+        mutex_unlock(&info->tree_log_mutex);
+        return 0;
+}
+/*
+ * returns 0 if there was a log transaction running and we were able
+ * to join, or returns -ENOENT if there were no log transactions
+ * in progress
+ *
+ * root->log_root is tested once without the lock as a cheap early exit
+ * and then again with the tree_log_mutex held before the writer count is
+ * raised.
+ */
+static int join_running_log_trans(struct btrfs_root *root)
+{
+        struct btrfs_fs_info *info;
+        int joined = -ENOENT;
+        smp_mb();
+        if (root->log_root == NULL)
+                return -ENOENT;
+        info = root->fs_info;
+        mutex_lock(&info->tree_log_mutex);
+        if (root->log_root != NULL) {
+                joined = 0;
+                atomic_inc(&info->tree_log_writers);
+                info->tree_log_batch++;
+        }
+        mutex_unlock(&info->tree_log_mutex);
+        return joined;
+}
+/*
+ * indicate we're done making changes to the log tree
+ * and wake up anyone waiting to do a sync
+ *
+ * Drops the writer count taken by start_log_trans or
+ * join_running_log_trans.  Always returns 0.
+ */
+static int end_log_trans(struct btrfs_root *root)
+{
+        struct btrfs_fs_info *info = root->fs_info;
+        atomic_dec(&info->tree_log_writers);
+        /* order the decrement before looking at the wait queue */
+        smp_mb();
+        if (!waitqueue_active(&info->tree_log_wait))
+                return 0;
+        wake_up(&info->tree_log_wait);
+        return 0;
+}
+/*
+ * the walk control struct is used to pass state down the chain when
+ * processing the log tree.  The stage field tells us which part
+ * of the log tree processing we are currently doing.  The others
+ * are state fields used for that specific part
+ *
+ * free, write, wait and pin are used as booleans.
+ */
+struct walk_control {
+        /* should we free the extent on disk when done?  This is used
+         * at transaction commit time while freeing a log tree
+         */
+        int free;
+        /* should we write out the extent buffer?  This is used
+         * while flushing the log tree to disk during a sync
+         */
+        int write;
+        /* should we wait for the extent buffer io to finish?  Also used
+         * while flushing the log tree to disk for a sync
+         */
+        int wait;
+        /* pin only walk, we record which extents on disk belong to the
+         * log trees
+         */
+        int pin;
+        /* what stage of the replay code we're currently in
+         * (one of the LOG_WALK_* values)
+         */
+        int stage;
+        /* the root we are currently replaying */
+        struct btrfs_root *replay_dest;
+        /* the trans handle for the current replay */
+        struct btrfs_trans_handle *trans;
+        /* the function that gets used to process blocks we find in the
+         * tree.  Note the extent_buffer might not be up to date when it is
+         * passed in, and it must be checked or read if you need the data
+         * inside it
+         *
+         * it is handed the log root, the block, this walk_control and a
+         * generation number for the block
+         */
+        int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
+                            struct walk_control *wc, u64 gen);
+};
+/*
+ * process_func used to pin down extents, write them or wait on them
+ *
+ * When wc->pin is set the block's extent is recorded as pinned under the
+ * pinned_mutex.  Writing and waiting only happen for blocks that are up
+ * to date for generation 'gen'.  Always returns 0.
+ */
+static int process_one_buffer(struct btrfs_root *log,
+                              struct extent_buffer *eb,
+                              struct walk_control *wc, u64 gen)
+{
+        struct btrfs_fs_info *info = log->fs_info;
+        if (wc->pin) {
+                mutex_lock(&info->pinned_mutex);
+                btrfs_update_pinned_extents(info->extent_root,
+                                            eb->start, eb->len, 1);
+                mutex_unlock(&info->pinned_mutex);
+        }
+        /* nothing to write or wait for unless the buffer is up to date */
+        if (!btrfs_buffer_uptodate(eb, gen))
+                return 0;
+        if (wc->write)
+                btrfs_write_tree_block(eb);
+        if (wc->wait)
+                btrfs_wait_tree_block_writeback(eb);
+        return 0;
+}
+/*
+ * Item overwrite used by replay and tree logging.  eb, slot and key all refer
+ * to the src data we are copying out.
+ *
+ * root is the tree we are copying into, and path is a scratch
+ * path for use in this function (it should be released on entry and
+ * will be released on exit).
+ *
+ * If the key is already in the destination tree the existing item is
+ * overwritten.  If the existing item isn't big enough, it is extended.
+ * If it is too large, it is truncated.
+ *
+ * If the key isn't in the destination yet, a new item is inserted.
+ *
+ * Returns 0 on success and -ENOMEM if the scratch buffers used to compare
+ * the source and destination items cannot be allocated.  A failure to
+ * insert or extend the destination item ends in BUG().
+ */
+static noinline int overwrite_item(struct btrfs_trans_handle *trans,
+                                   struct btrfs_root *root,
+                                   struct btrfs_path *path,
+                                   struct extent_buffer *eb, int slot,
+                                   struct btrfs_key *key)
+{
+        int ret;
+        u32 item_size;
+        u64 saved_i_size = 0;
+        int save_old_i_size = 0;
+        unsigned long src_ptr;
+        unsigned long dst_ptr;
+        int overwrite_root = 0;
+        /* set when the destination root is not the log tree itself */
+        if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
+                overwrite_root = 1;
+        item_size = btrfs_item_size_nr(eb, slot);
+        src_ptr = btrfs_item_ptr_offset(eb, slot);
+        /* look for the key in the destination tree */
+        ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+        if (ret == 0) {
+                char *src_copy;
+                char *dst_copy;
+                u32 dst_size = btrfs_item_size_nr(path->nodes[0],
+                                                  path->slots[0]);
+                /* a size mismatch can never be an identical item */
+                if (dst_size != item_size)
+                        goto insert;
+                if (item_size == 0) {
+                        btrfs_release_path(root, path);
+                        return 0;
+                }
+                dst_copy = kmalloc(item_size, GFP_NOFS);
+                src_copy = kmalloc(item_size, GFP_NOFS);
+                if (!dst_copy || !src_copy) {
+                        /* kfree(NULL) is a no-op, so both can be freed */
+                        kfree(dst_copy);
+                        kfree(src_copy);
+                        btrfs_release_path(root, path);
+                        return -ENOMEM;
+                }
+                read_extent_buffer(eb, src_copy, src_ptr, item_size);
+                dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
+                read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
+                                   item_size);
+                ret = memcmp(dst_copy, src_copy, item_size);
+                kfree(dst_copy);
+                kfree(src_copy);
+                /*
+                 * they have the same contents, just return, this saves
+                 * us from cowing blocks in the destination tree and doing
+                 * extra writes that may not have been done by a previous
+                 * sync
+                 */
+                if (ret == 0) {
+                        btrfs_release_path(root, path);
+                        return 0;
+                }
+        }
+insert:
+        btrfs_release_path(root, path);
+        /* try to insert the key into the destination tree */
+        ret = btrfs_insert_empty_item(trans, root, path,
+                                      key, item_size);
+        /* make sure any existing item is the correct size */
+        if (ret == -EEXIST) {
+                u32 found_size;
+                found_size = btrfs_item_size_nr(path->nodes[0],
+                                                path->slots[0]);
+                if (found_size > item_size) {
+                        btrfs_truncate_item(trans, root, path, item_size, 1);
+                } else if (found_size < item_size) {
+                        ret = btrfs_extend_item(trans, root, path,
+                                                item_size - found_size);
+                        BUG_ON(ret);
+                }
+        } else if (ret) {
+                BUG();
+        }
+        dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
+                                        path->slots[0]);
+        /* don't overwrite an existing inode if the generation number
+         * was logged as zero.  This is done when the tree logging code
+         * is just logging an inode to make sure it exists after recovery.
+         *
+         * Also, don't overwrite i_size on directories during replay.
+         * log replay inserts and removes directory items based on the
+         * state of the tree found in the subvolume, and i_size is modified
+         * as it goes
+         */
+        if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
+                struct btrfs_inode_item *src_item;
+                struct btrfs_inode_item *dst_item;
+                src_item = (struct btrfs_inode_item *)src_ptr;
+                dst_item = (struct btrfs_inode_item *)dst_ptr;
+                if (btrfs_inode_generation(eb, src_item) == 0)
+                        goto no_copy;
+                if (overwrite_root &&
+                    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
+                    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
+                        save_old_i_size = 1;
+                        saved_i_size = btrfs_inode_size(path->nodes[0],
+                                                        dst_item);
+                }
+        }
+        copy_extent_buffer(path->nodes[0], eb, dst_ptr,
+                           src_ptr, item_size);
+        /* put back the directory i_size that the copy just replaced */
+        if (save_old_i_size) {
+                struct btrfs_inode_item *dst_item;
+                dst_item = (struct btrfs_inode_item *)dst_ptr;
+                btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
+        }
+        /* make sure the generation is filled in */
+        if (key->type == BTRFS_INODE_ITEM_KEY) {
+                struct btrfs_inode_item *dst_item;
+                dst_item = (struct btrfs_inode_item *)dst_ptr;
+                if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
+                        btrfs_set_inode_generation(path->nodes[0], dst_item,
+                                                   trans->transid);
+                }
+        }
+no_copy:
+        btrfs_mark_buffer_dirty(path->nodes[0]);
+        btrfs_release_path(root, path);
+        return 0;
+}
+/*
+ * simple helper to read an inode off the disk from a given root
+ * This can only be called for subvolume roots and not for the log
+ *
+ * Returns the inode with a reference held (callers drop it with iput),
+ * or NULL if no inode could be obtained or the inode turned out bad.
+ */
+static noinline struct inode *read_one_inode(struct btrfs_root *root,
+                                             u64 objectid)
+{
+        struct inode *inode;
+        inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
+        /*
+         * NOTE(review): btrfs_iget_locked is assumed to return NULL when
+         * the inode cannot be allocated -- confirm against its definition.
+         * Every caller already treats a NULL return as "no inode".
+         */
+        if (!inode)
+                return NULL;
+        /* a freshly created in-memory inode still has to be filled in */
+        if (inode->i_state & I_NEW) {
+                BTRFS_I(inode)->root = root;
+                BTRFS_I(inode)->location.objectid = objectid;
+                BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
+                BTRFS_I(inode)->location.offset = 0;
+                btrfs_read_locked_inode(inode);
+                unlock_new_inode(inode);
+        }
+        if (is_bad_inode(inode)) {
+                iput(inode);
+                inode = NULL;
+        }
+        return inode;
+}
+/* replays a single extent in 'eb' at 'slot' with 'key' into the
+ * subvolume 'root'.  path is released on entry and should be released
+ * on exit.
+ *
+ * extents in the log tree have not been allocated out of the extent
+ * tree yet.  So, this completes the allocation, taking a reference
+ * as required if the extent already exists or creating a new extent
+ * if it isn't in the extent allocation tree yet.
+ *
+ * The extent is inserted into the file, dropping any existing extents
+ * from the file that overlap the new one.
+ *
+ * Returns 0 for extent types that are not handled here, -EIO if the
+ * owning inode cannot be read, and otherwise the result of the last
+ * step performed.  Failures of the individual tree operations end in
+ * BUG_ON().
+ */
+static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
+                                      struct btrfs_root *root,
+                                      struct btrfs_path *path,
+                                      struct extent_buffer *eb, int slot,
+                                      struct btrfs_key *key)
+{
+        int found_type;
+        u64 mask = root->sectorsize - 1;
+        u64 extent_end;
+        u64 alloc_hint;
+        u64 start = key->offset;
+        u64 saved_nbytes;
+        struct btrfs_file_extent_item *item;
+        struct inode *inode = NULL;
+        unsigned long size;
+        int ret = 0;
+        item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
+        found_type = btrfs_file_extent_type(eb, item);
+        if (found_type == BTRFS_FILE_EXTENT_REG ||
+            found_type == BTRFS_FILE_EXTENT_PREALLOC)
+                extent_end = start + btrfs_file_extent_num_bytes(eb, item);
+        else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+                /* inline data ends at the next sectorsize boundary */
+                size = btrfs_file_extent_inline_len(eb, item);
+                extent_end = (start + size + mask) & ~mask;
+        } else {
+                ret = 0;
+                goto out;
+        }
+        inode = read_one_inode(root, key->objectid);
+        if (!inode) {
+                ret = -EIO;
+                goto out;
+        }
+        /*
+         * first check to see if we already have this extent in the
+         * file.  This must be done before the btrfs_drop_extents run
+         * so we don't try to drop this extent.
+         */
+        ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
+                                       start, 0);
+        if (ret == 0 &&
+            (found_type == BTRFS_FILE_EXTENT_REG ||
+             found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
+                struct btrfs_file_extent_item cmp1;
+                struct btrfs_file_extent_item cmp2;
+                struct btrfs_file_extent_item *existing;
+                struct extent_buffer *leaf;
+                leaf = path->nodes[0];
+                existing = btrfs_item_ptr(leaf, path->slots[0],
+                                          struct btrfs_file_extent_item);
+                read_extent_buffer(eb, &cmp1, (unsigned long)item,
+                                   sizeof(cmp1));
+                read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
+                                   sizeof(cmp2));
+                /*
+                 * we already have a pointer to this exact extent,
+                 * we don't have to do anything
+                 */
+                if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
+                        btrfs_release_path(root, path);
+                        goto out;
+                }
+        }
+        btrfs_release_path(root, path);
+        /* remember the byte count so it can be restored after the replay */
+        saved_nbytes = inode_get_bytes(inode);
+        /* drop any overlapping extents */
+        ret = btrfs_drop_extents(trans, root, inode,
+                         start, extent_end, start, &alloc_hint);
+        BUG_ON(ret);
+        if (found_type == BTRFS_FILE_EXTENT_REG ||
+            found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+                unsigned long dest_offset;
+                struct btrfs_key ins;
+                ret = btrfs_insert_empty_item(trans, root, path, key,
+                                              sizeof(*item));
+                BUG_ON(ret);
+                dest_offset = btrfs_item_ptr_offset(path->nodes[0],
+                                                    path->slots[0]);
+                copy_extent_buffer(path->nodes[0], eb, dest_offset,
+                                (unsigned long)item,  sizeof(*item));
+                ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
+                ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
+                ins.type = BTRFS_EXTENT_ITEM_KEY;
+                /* a disk bytenr of zero has no extent or csums to process */
+                if (ins.objectid > 0) {
+                        u64 csum_start;
+                        u64 csum_end;
+                        LIST_HEAD(ordered_sums);
+                        /*
+                         * is this extent already allocated in the extent
+                         * allocation tree?  If so, just add a reference
+                         */
+                        ret = btrfs_lookup_extent(root, ins.objectid,
+                                                ins.offset);
+                        if (ret == 0) {
+                                ret = btrfs_inc_extent_ref(trans, root,
+                                                ins.objectid, ins.offset,
+                                                path->nodes[0]->start,
+                                                root->root_key.objectid,
+                                                trans->transid, key->objectid);
+                                /*
+                                 * ret is overwritten further down, so a
+                                 * failure has to be caught here, the same
+                                 * way the allocation branch below does
+                                 */
+                                BUG_ON(ret);
+                        } else {
+                                /*
+                                 * insert the extent pointer in the extent
+                                 * allocation tree
+                                 */
+                                ret = btrfs_alloc_logged_extent(trans, root,
+                                                path->nodes[0]->start,
+                                                root->root_key.objectid,
+                                                trans->transid, key->objectid,
+                                                &ins);
+                                BUG_ON(ret);
+                        }
+                        btrfs_release_path(root, path);
+                        /*
+                         * compressed extents take the csums for the whole
+                         * on-disk extent, others only for the range the
+                         * file extent item refers to
+                         */
+                        if (btrfs_file_extent_compression(eb, item)) {
+                                csum_start = ins.objectid;
+                                csum_end = csum_start + ins.offset;
+                        } else {
+                                csum_start = ins.objectid +
+                                        btrfs_file_extent_offset(eb, item);
+                                csum_end = csum_start +
+                                        btrfs_file_extent_num_bytes(eb, item);
+                        }
+                        ret = btrfs_lookup_csums_range(root->log_root,
+                                                csum_start, csum_end - 1,
+                                                &ordered_sums);
+                        BUG_ON(ret);
+                        /* move each csum found in the log into the csum root */
+                        while (!list_empty(&ordered_sums)) {
+                                struct btrfs_ordered_sum *sums;
+                                sums = list_entry(ordered_sums.next,
+                                                struct btrfs_ordered_sum,
+                                                list);
+                                ret = btrfs_csum_file_blocks(trans,
+                                                root->fs_info->csum_root,
+                                                sums);
+                                BUG_ON(ret);
+                                list_del(&sums->list);
+                                kfree(sums);
+                        }
+                } else {
+                        btrfs_release_path(root, path);
+                }
+        } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+                /* inline extents are easy, we just overwrite them */
+                ret = overwrite_item(trans, root, path, eb, slot, key);
+                BUG_ON(ret);
+        }
+        inode_set_bytes(inode, saved_nbytes);
+        btrfs_update_inode(trans, root, inode);
+out:
+        if (inode)
+                iput(inode);
+        return ret;
+}
+/*
+ * when cleaning up conflicts between the directory names in the
+ * subvolume, directory names in the log and directory names in the
+ * inode back references, we may have to unlink inodes from directories.
+ *
+ * This is a helper function to do the unlink of a specific directory
+ * item
+ *
+ * di is read out of path->nodes[0]; the path is released before the
+ * unlink is done.  Returns 0 on success and -ENOMEM if the name buffer
+ * cannot be allocated.  Other failures end in BUG_ON().
+ */
+static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
+                                      struct btrfs_root *root,
+                                      struct btrfs_path *path,
+                                      struct inode *dir,
+                                      struct btrfs_dir_item *di)
+{
+        struct inode *inode;
+        char *name;
+        int name_len;
+        struct extent_buffer *leaf;
+        struct btrfs_key location;
+        int ret;
+        leaf = path->nodes[0];
+        btrfs_dir_item_key_to_cpu(leaf, di, &location);
+        name_len = btrfs_dir_name_len(leaf, di);
+        name = kmalloc(name_len, GFP_NOFS);
+        if (!name) {
+                /* release the path on this exit too, as on the normal one */
+                btrfs_release_path(root, path);
+                return -ENOMEM;
+        }
+        /* the name bytes are stored right after the dir item header */
+        read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
+        btrfs_release_path(root, path);
+        inode = read_one_inode(root, location.objectid);
+        BUG_ON(!inode);
+        /* record the inode so its link count is checked after replay */
+        ret = link_to_fixup_dir(trans, root, path, location.objectid);
+        BUG_ON(ret);
+        ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
+        BUG_ON(ret);
+        kfree(name);
+        iput(inode);
+        return ret;
+}
+/*
+ * check whether a name and sequence number taken from an inode back
+ * reference are already present in directory 'dirid' and resolve to
+ * 'objectid'.  Both the dir index entry and the dir item must exist and
+ * agree.  Returns 1 when they do and 0 otherwise; path is released on
+ * return.
+ */
+static noinline int inode_in_dir(struct btrfs_root *root,
+                                 struct btrfs_path *path,
+                                 u64 dirid, u64 objectid, u64 index,
+                                 const char *name, int name_len)
+{
+        struct btrfs_dir_item *entry;
+        struct btrfs_key target;
+        int found = 0;
+        /* first the entry keyed by sequence number */
+        entry = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
+                                            index, name, name_len, 0);
+        if (!entry || IS_ERR(entry))
+                goto done;
+        btrfs_dir_item_key_to_cpu(path->nodes[0], entry, &target);
+        if (target.objectid != objectid)
+                goto done;
+        btrfs_release_path(root, path);
+        /* then the entry looked up by name */
+        entry = btrfs_lookup_dir_item(NULL, root, path, dirid, name,
+                                      name_len, 0);
+        if (!entry || IS_ERR(entry))
+                goto done;
+        btrfs_dir_item_key_to_cpu(path->nodes[0], entry, &target);
+        if (target.objectid == objectid)
+                found = 1;
+done:
+        btrfs_release_path(root, path);
+        return found;
+}
+/*
+ * look in the log tree for an inode back reference item at 'key' that
+ * contains the given name.  The result decides whether a back reference
+ * found in the subvolume conflicts with what is in the log.
+ *
+ * A single back reference item may pack several refs, each one a
+ * struct btrfs_inode_ref followed by its name bytes, so every ref in
+ * the item is checked.  During replay we handle one reference at a time
+ * and must not delete a valid link from the subvolume when the same
+ * link is also in the log.
+ *
+ * Returns 1 if the name is found and 0 otherwise.
+ */
+static noinline int backref_in_log(struct btrfs_root *log,
+                                   struct btrfs_key *key,
+                                   char *name, int namelen)
+{
+        struct btrfs_path *path;
+        struct extent_buffer *leaf;
+        unsigned long cur;
+        unsigned long end;
+        int item_size;
+        int found = 0;
+        path = btrfs_alloc_path();
+        if (btrfs_search_slot(NULL, log, key, path, 0, 0) != 0)
+                goto done;
+        leaf = path->nodes[0];
+        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+        cur = btrfs_item_ptr_offset(leaf, path->slots[0]);
+        end = cur + item_size;
+        while (cur < end) {
+                struct btrfs_inode_ref *ref = (struct btrfs_inode_ref *)cur;
+                unsigned long name_ptr = (unsigned long)(ref + 1);
+                int len = btrfs_inode_ref_name_len(leaf, ref);
+                /* only compare the bytes when the lengths agree */
+                if (len == namelen &&
+                    memcmp_extent_buffer(leaf, name, name_ptr,
+                                         namelen) == 0) {
+                        found = 1;
+                        break;
+                }
+                /* step over this ref and its name to the next one */
+                cur = name_ptr + len;
+        }
+done:
+        btrfs_free_path(path);
+        return found;
+}
+/*
+ * replay one inode back reference item found in the log tree.
+ * eb, slot and key refer to the buffer and key found in the log tree.
+ * root is the destination we are replaying into, and path is for temp
+ * use by this function.  (it should be released on return).
+ *
+ * key->objectid is the inode the reference belongs to and key->offset
+ * is the directory holding the name.  The item may hold several names;
+ * each one is replayed in turn.
+ *
+ * Returns -ENOENT if the parent directory cannot be read and 0
+ * otherwise.  Other failures end in BUG_ON().
+ */
+static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
+                                  struct btrfs_root *root,
+                                  struct btrfs_root *log,
+                                  struct btrfs_path *path,
+                                  struct extent_buffer *eb, int slot,
+                                  struct btrfs_key *key)
+{
+        struct inode *dir;
+        int ret;
+        struct btrfs_inode_ref *ref;
+        struct btrfs_dir_item *di;
+        struct inode *inode;
+        char *name;
+        int namelen;
+        unsigned long ref_ptr;
+        unsigned long ref_end;
+        /*
+         * it is possible that we didn't log all the parent directories
+         * for a given inode.  If we don't find the dir, just don't
+         * copy the back ref in.  The link count fixup code will take
+         * care of the rest
+         */
+        dir = read_one_inode(root, key->offset);
+        if (!dir)
+                return -ENOENT;
+        inode = read_one_inode(root, key->objectid);
+        /* dir was checked above; it is the inode that is unverified here */
+        BUG_ON(!inode);
+        ref_ptr = btrfs_item_ptr_offset(eb, slot);
+        ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
+again:
+        /* copy the name of the current ref out of the log buffer */
+        ref = (struct btrfs_inode_ref *)ref_ptr;
+        namelen = btrfs_inode_ref_name_len(eb, ref);
+        name = kmalloc(namelen, GFP_NOFS);
+        BUG_ON(!name);
+        read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen);
+        /* if we already have a perfect match, we're done */
+        if (inode_in_dir(root, path, dir->i_ino, inode->i_ino,
+                         btrfs_inode_ref_index(eb, ref),
+                         name, namelen)) {
+                goto out;
+        }
+        /*
+         * look for a conflicting back reference in the metadata.
+         * if we find one we have to unlink that name of the file
+         * before we add our new link.  Later on, we overwrite any
+         * existing back reference, and we don't want to create
+         * dangling pointers in the directory.
+         */
+conflict_again:
+        ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+        if (ret == 0) {
+                char *victim_name;
+                int victim_name_len;
+                struct btrfs_inode_ref *victim_ref;
+                unsigned long ptr;
+                unsigned long ptr_end;
+                struct extent_buffer *leaf = path->nodes[0];
+                /* are we trying to overwrite a back ref for the root directory
+                 * if so, just jump out, we're done
+                 */
+                if (key->objectid == key->offset) {
+                        /* this exit skips the kfree at out:, so free here */
+                        kfree(name);
+                        goto out_nowrite;
+                }
+                /* check all the names in this back reference to see
+                 * if they are in the log.  if so, we allow them to stay
+                 * otherwise they must be unlinked as a conflict
+                 */
+                ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+                ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
+                while (ptr < ptr_end) {
+                        victim_ref = (struct btrfs_inode_ref *)ptr;
+                        victim_name_len = btrfs_inode_ref_name_len(leaf,
+                                                                   victim_ref);
+                        victim_name = kmalloc(victim_name_len, GFP_NOFS);
+                        BUG_ON(!victim_name);
+                        read_extent_buffer(leaf, victim_name,
+                                           (unsigned long)(victim_ref + 1),
+                                           victim_name_len);
+                        if (!backref_in_log(log, key, victim_name,
+                                            victim_name_len)) {
+                                /*
+                                 * the link count is raised before the
+                                 * unlink; the item is searched again
+                                 * afterwards because the path was released
+                                 */
+                                btrfs_inc_nlink(inode);
+                                btrfs_release_path(root, path);
+                                ret = btrfs_unlink_inode(trans, root, dir,
+                                                         inode, victim_name,
+                                                         victim_name_len);
+                                kfree(victim_name);
+                                btrfs_release_path(root, path);
+                                goto conflict_again;
+                        }
+                        kfree(victim_name);
+                        ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
+                }
+                BUG_ON(ret);
+        }
+        btrfs_release_path(root, path);
+        /* look for a conflicting sequence number */
+        di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
+                                         btrfs_inode_ref_index(eb, ref),
+                                         name, namelen, 0);
+        if (di && !IS_ERR(di)) {
+                ret = drop_one_dir_item(trans, root, path, dir, di);
+                BUG_ON(ret);
+        }
+        btrfs_release_path(root, path);
+        /* look for a conflicting name */
+        di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
+                                   name, namelen, 0);
+        if (di && !IS_ERR(di)) {
+                ret = drop_one_dir_item(trans, root, path, dir, di);
+                BUG_ON(ret);
+        }
+        btrfs_release_path(root, path);
+        /* insert our name */
+        ret = btrfs_add_link(trans, dir, inode, name, namelen, 0,
+                             btrfs_inode_ref_index(eb, ref));
+        BUG_ON(ret);
+        btrfs_update_inode(trans, root, inode);
+out:
+        /* advance to the next name packed in this item, if there is one */
+        ref_ptr = (unsigned long)(ref + 1) + namelen;
+        kfree(name);
+        if (ref_ptr < ref_end)
+                goto again;
+        /* finally write the back reference in the inode */
+        ret = overwrite_item(trans, root, path, eb, slot, key);
+        BUG_ON(ret);
+out_nowrite:
+        btrfs_release_path(root, path);
+        iput(dir);
+        iput(inode);
+        return 0;
+}
+/*
+ * There are a few corners where the link count of the file can't
+ * be properly maintained during replay.  So, instead of adding
+ * lots of complexity to the log code, we just scan the backrefs
+ * for any file that has been through replay.
+ *
+ * The scan will update the link count on the inode to reflect the
+ * number of back refs found.  If it goes down to zero, the iput
+ * will free the inode.
+ *
+ * Returns 0 once the scan is done, or -ENOMEM if no path can be
+ * allocated.  A search error ends the scan early, and the count
+ * gathered so far is still applied.
+ */
+static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
+                                           struct btrfs_root *root,
+                                           struct inode *inode)
+{
+        struct btrfs_path *path;
+        int ret;
+        struct btrfs_key key;
+        u64 nlink = 0;
+        unsigned long ptr;
+        unsigned long ptr_end;
+        int name_len;
+        /* start at the highest possible offset and walk backwards */
+        key.objectid = inode->i_ino;
+        key.type = BTRFS_INODE_REF_KEY;
+        key.offset = (u64)-1;
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+        while (1) {
+                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+                if (ret < 0)
+                        break;
+                if (ret > 0) {
+                        /* no exact match: step back to the previous item */
+                        if (path->slots[0] == 0)
+                                break;
+                        path->slots[0]--;
+                }
+                btrfs_item_key_to_cpu(path->nodes[0], &key,
+                                      path->slots[0]);
+                if (key.objectid != inode->i_ino ||
+                    key.type != BTRFS_INODE_REF_KEY)
+                        break;
+                ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
+                ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
+                                                   path->slots[0]);
+                /* every ref packed in the item counts as one link */
+                while (ptr < ptr_end) {
+                        struct btrfs_inode_ref *ref;
+                        ref = (struct btrfs_inode_ref *)ptr;
+                        name_len = btrfs_inode_ref_name_len(path->nodes[0],
+                                                            ref);
+                        ptr = (unsigned long)(ref + 1) + name_len;
+                        nlink++;
+                }
+                if (key.offset == 0)
+                        break;
+                key.offset--;
+                btrfs_release_path(root, path);
+        }
+        btrfs_free_path(path);
+        if (nlink != inode->i_nlink) {
+                inode->i_nlink = nlink;
+                btrfs_update_inode(trans, root, inode);
+        }
+        BTRFS_I(inode)->index_cnt = (u64)-1;
+        return 0;
+}
+/*
+ * walk the fixup items recorded under BTRFS_TREE_LOG_FIXUP_OBJECTID from
+ * the highest offset down.  Each item is deleted and the inode named by
+ * its offset gets its link count recomputed.  Always returns 0; path is
+ * released on return.
+ */
+static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
+                                            struct btrfs_root *root,
+                                            struct btrfs_path *path)
+{
+        struct btrfs_key fixup_key;
+        struct inode *victim;
+        int err;
+        fixup_key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
+        fixup_key.type = BTRFS_ORPHAN_ITEM_KEY;
+        fixup_key.offset = (u64)-1;
+        for (;;) {
+                err = btrfs_search_slot(trans, root, &fixup_key, path, -1, 1);
+                if (err < 0)
+                        break;
+                if (err == 1) {
+                        /* no exact match, use the item just before */
+                        if (path->slots[0] == 0)
+                                break;
+                        path->slots[0]--;
+                }
+                btrfs_item_key_to_cpu(path->nodes[0], &fixup_key,
+                                      path->slots[0]);
+                /* stop once we have walked off the fixup items */
+                if (fixup_key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID)
+                        break;
+                if (fixup_key.type != BTRFS_ORPHAN_ITEM_KEY)
+                        break;
+                err = btrfs_del_item(trans, root, path);
+                BUG_ON(err);
+                btrfs_release_path(root, path);
+                /* the item offset is the objectid of the inode to fix */
+                victim = read_one_inode(root, fixup_key.offset);
+                BUG_ON(!victim);
+                err = fixup_inode_link_count(trans, root, victim);
+                BUG_ON(err);
+                iput(victim);
+                if (fixup_key.offset == 0)
+                        break;
+                fixup_key.offset--;
+        }
+        btrfs_release_path(root, path);
+        return 0;
+}
+/*
+ * remember an inode in the fixup dir so that its link count gets checked
+ * once replay has finished.  When the fixup item is newly inserted the
+ * link count is bumped, which keeps the inode around until that check.
+ * An already existing fixup item is not an error.  Returns 0; any other
+ * insertion failure ends in BUG().
+ */
+static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
+                                      struct btrfs_root *root,
+                                      struct btrfs_path *path,
+                                      u64 objectid)
+{
+        struct btrfs_key fixup_key;
+        struct inode *target;
+        int err;
+        target = read_one_inode(root, objectid);
+        BUG_ON(!target);
+        fixup_key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
+        btrfs_set_key_type(&fixup_key, BTRFS_ORPHAN_ITEM_KEY);
+        fixup_key.offset = objectid;
+        err = btrfs_insert_empty_item(trans, root, path, &fixup_key, 0);
+        btrfs_release_path(root, path);
+        switch (err) {
+        case 0:
+                /* first time this inode is recorded */
+                btrfs_inc_nlink(target);
+                btrfs_update_inode(trans, root, target);
+                break;
+        case -EEXIST:
+                /* already recorded, nothing more to do */
+                err = 0;
+                break;
+        default:
+                BUG();
+        }
+        iput(target);
+        return err;
+}
+/*
+ * add one directory entry during replay of a directory log.  A name is
+ * only inserted when the inode it points to actually exists, so an fsync
+ * on a directory does not implicitly fsync all the new files in it.
+ *
+ * Returns -ENOENT if the target inode cannot be read, -EIO if the
+ * directory cannot be read, and otherwise the result of btrfs_add_link.
+ */
+static noinline int insert_one_name(struct btrfs_trans_handle *trans,
+                                    struct btrfs_root *root,
+                                    struct btrfs_path *path,
+                                    u64 dirid, u64 index,
+                                    char *name, int name_len, u8 type,
+                                    struct btrfs_key *location)
+{
+        struct inode *target;
+        struct inode *parent;
+        int err;
+        target = read_one_inode(root, location->objectid);
+        if (!target)
+                return -ENOENT;
+        parent = read_one_inode(root, dirid);
+        if (parent)
+                err = btrfs_add_link(trans, parent, target, name, name_len,
+                                     1, index);
+        else
+                err = -EIO;
+        /* FIXME, put inode into FIXUP list */
+        iput(target);
+        if (parent)
+                iput(parent);
+        return err;
+}
+/*
+ * take a single entry in a log directory item and replay it into
+ * the subvolume.
+ *
+ * if a conflicting item exists in the subdirectory already,
+ * the inode it points to is unlinked and put into the link count
+ * fix up tree.
+ *
+ * If a name from the log points to a file or directory that does
+ * not exist in the FS, it is skipped.  fsyncs on directories
+ * do not force down inodes inside that directory, just changes to the
+ * names or unlinks in a directory.
+ *
+ * 'eb'/'di' locate the logged entry and 'key' is the key of the log
+ * item that holds it (BTRFS_DIR_ITEM_KEY or BTRFS_DIR_INDEX_KEY; any
+ * other type is a BUG).  Returns 0 when the entry was replayed or
+ * skipped, and -ENOMEM when the name buffer cannot be allocated.
+ */
+static noinline int replay_one_name(struct btrfs_trans_handle *trans,
+                                    struct btrfs_root *root,
+                                    struct btrfs_path *path,
+                                    struct extent_buffer *eb,
+                                    struct btrfs_dir_item *di,
+                                    struct btrfs_key *key)
+{
+        char *name;
+        int name_len;
+        struct btrfs_dir_item *dst_di;
+        struct btrfs_key found_key;
+        struct btrfs_key log_key;
+        struct inode *dir;
+        u8 log_type;
+        int exists;
+        int ret;
+        dir = read_one_inode(root, key->objectid);
+        BUG_ON(!dir);
+        name_len = btrfs_dir_name_len(eb, di);
+        name = kmalloc(name_len, GFP_NOFS);
+        if (!name) {
+                /* nothing but the dir reference has been taken so far */
+                iput(dir);
+                return -ENOMEM;
+        }
+        log_type = btrfs_dir_type(eb, di);
+        /* the name bytes follow the btrfs_dir_item header directly */
+        read_extent_buffer(eb, name, (unsigned long)(di + 1),
+                   name_len);
+        btrfs_dir_item_key_to_cpu(eb, di, &log_key);
+        /* does the inode the logged name points at exist in the subvolume? */
+        exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
+        if (exists == 0)
+                exists = 1;
+        else
+                exists = 0;
+        btrfs_release_path(root, path);
+        /* find the entry the subvolume currently has for this name/index */
+        if (key->type == BTRFS_DIR_ITEM_KEY) {
+                dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
+                                       name, name_len, 1);
+        } else if (key->type == BTRFS_DIR_INDEX_KEY) {
+                dst_di = btrfs_lookup_dir_index_item(trans, root, path,
+                                                     key->objectid,
+                                                     key->offset, name,
+                                                     name_len, 1);
+        } else {
+                BUG();
+        }
+        if (!dst_di || IS_ERR(dst_di)) {
+                /* we need a sequence number to insert, so we only
+                 * do inserts for the BTRFS_DIR_INDEX_KEY types
+                 */
+                if (key->type != BTRFS_DIR_INDEX_KEY)
+                        goto out;
+                goto insert;
+        }
+        btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
+        /* the existing item matches the logged item */
+        if (found_key.objectid == log_key.objectid &&
+            found_key.type == log_key.type &&
+            found_key.offset == log_key.offset &&
+            btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
+                goto out;
+        }
+        /*
+         * don't drop the conflicting directory entry if the inode
+         * for the new entry doesn't exist
+         */
+        if (!exists)
+                goto out;
+        ret = drop_one_dir_item(trans, root, path, dir, dst_di);
+        BUG_ON(ret);
+        if (key->type == BTRFS_DIR_INDEX_KEY)
+                goto insert;
+out:
+        btrfs_release_path(root, path);
+        kfree(name);
+        iput(dir);
+        return 0;
+insert:
+        btrfs_release_path(root, path);
+        ret = insert_one_name(trans, root, path, key->objectid, key->offset,
+                              name, name_len, log_type, &log_key);
+        /* -ENOENT means the target inode is not in the FS: skip the name */
+        if (ret && ret != -ENOENT)
+                BUG();
+        goto out;
+}
+/*
+ * Walk every name packed into the directory item at 'slot' of 'eb' and
+ * reconcile each one into the subvolume via replay_one_name().  Only
+ * BTRFS_DIR_ITEM_KEY items will hold more than one name, but the same
+ * loop serves both directory index types.  Always returns 0; a failed
+ * replay of any single name is a BUG.
+ */
+static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
+                                        struct btrfs_root *root,
+                                        struct btrfs_path *path,
+                                        struct extent_buffer *eb, int slot,
+                                        struct btrfs_key *key)
+{
+        u32 total = btrfs_item_size_nr(eb, slot);
+        unsigned long cur = btrfs_item_ptr_offset(eb, slot);
+        unsigned long limit = cur + total;
+        while (cur < limit) {
+                struct btrfs_dir_item *entry = (struct btrfs_dir_item *)cur;
+                int len = btrfs_dir_name_len(eb, entry);
+                int err = replay_one_name(trans, root, path, eb, entry, key);
+                BUG_ON(err);
+                /* step over the entry header and the name that follows it */
+                cur = (unsigned long)(entry + 1);
+                cur += len;
+        }
+        return 0;
+}
+/*
+ * directory replay has two parts.  There are the standard directory
+ * items in the log copied from the subvolume, and range items
+ * created in the log while the subvolume was logged.
+ *
+ * The range items tell us which parts of the key space the log
+ * is authoritative for.  During replay, if a key in the subvolume
+ * directory is in a logged range item, but not actually in the log
+ * that means it was deleted from the directory before the fsync
+ * and should be removed.
+ *
+ * find_dir_range looks in 'root' for the range item of type 'key_type'
+ * belonging to 'dirid' that covers *start_ret, or failing that the next
+ * range item after it.  On success it returns 0 and stores the range in
+ * *start_ret / *end_ret.  It returns 1 when there is no such range
+ * (including when *start_ret is already (u64)-1) and a negative value
+ * when the tree search fails.  The path is released before returning
+ * from any search.
+ */
+static noinline int find_dir_range(struct btrfs_root *root,
+                                   struct btrfs_path *path,
+                                   u64 dirid, int key_type,
+                                   u64 *start_ret, u64 *end_ret)
+{
+        struct btrfs_key key;
+        u64 found_end;
+        struct btrfs_dir_log_item *item;
+        int ret;
+        int nritems;
+        if (*start_ret == (u64)-1)
+                return 1;
+        key.objectid = dirid;
+        key.type = key_type;
+        key.offset = *start_ret;
+        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+        if (ret < 0)
+                goto out;
+        if (ret > 0) {
+                /* no exact match: look at the item just before the slot */
+                if (path->slots[0] == 0)
+                        goto out;
+                path->slots[0]--;
+        }
+        if (ret != 0)
+                btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+        if (key.type != key_type || key.objectid != dirid) {
+                ret = 1;
+                goto next;
+        }
+        item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+                              struct btrfs_dir_log_item);
+        found_end = btrfs_dir_log_end(path->nodes[0], item);
+        if (*start_ret >= key.offset && *start_ret <= found_end) {
+                ret = 0;
+                *start_ret = key.offset;
+                *end_ret = found_end;
+                goto out;
+        }
+        ret = 1;
+next:
+        /* check the next slot in the tree to see if it is a valid item */
+        nritems = btrfs_header_nritems(path->nodes[0]);
+        /*
+         * advance first and only then compare against nritems, so that
+         * stepping off the last slot moves to the next leaf instead of
+         * reading a key past the end of this one
+         */
+        path->slots[0]++;
+        if (path->slots[0] >= nritems) {
+                ret = btrfs_next_leaf(root, path);
+                if (ret)
+                        goto out;
+        }
+        btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+        if (key.type != key_type || key.objectid != dirid) {
+                ret = 1;
+                goto out;
+        }
+        item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+                              struct btrfs_dir_log_item);
+        found_end = btrfs_dir_log_end(path->nodes[0], item);
+        *start_ret = key.offset;
+        *end_ret = found_end;
+        ret = 0;
+out:
+        btrfs_release_path(root, path);
+        return ret;
+}
+/*
+ * this looks for a given directory item in the log.  If the directory
+ * item is not in the log, the item is removed and the inode it points
+ * to is unlinked
+ *
+ * On entry 'path' must already point (nodes[0]/slots[0]) at the
+ * subvolume directory item for 'dir_key'; every name packed in that item
+ * is looked up in 'log' using 'log_path'.  Both paths are released
+ * before returning.  Returns 0 on success and -ENOMEM if a name buffer
+ * cannot be allocated; failures while unlinking are a BUG.
+ */
+static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
+                                      struct btrfs_root *root,
+                                      struct btrfs_root *log,
+                                      struct btrfs_path *path,
+                                      struct btrfs_path *log_path,
+                                      struct inode *dir,
+                                      struct btrfs_key *dir_key)
+{
+        int ret;
+        struct extent_buffer *eb;
+        int slot;
+        u32 item_size;
+        struct btrfs_dir_item *di;
+        struct btrfs_dir_item *log_di;
+        int name_len;
+        unsigned long ptr;
+        unsigned long ptr_end;
+        char *name;
+        struct inode *inode;
+        struct btrfs_key location;
+again:
+        /* (re)load the item bounds; we come back here after every unlink */
+        eb = path->nodes[0];
+        slot = path->slots[0];
+        item_size = btrfs_item_size_nr(eb, slot);
+        ptr = btrfs_item_ptr_offset(eb, slot);
+        ptr_end = ptr + item_size;
+        while (ptr < ptr_end) {
+                di = (struct btrfs_dir_item *)ptr;
+                name_len = btrfs_dir_name_len(eb, di);
+                name = kmalloc(name_len, GFP_NOFS);
+                if (!name) {
+                        ret = -ENOMEM;
+                        goto out;
+                }
+                /* the name bytes follow the btrfs_dir_item header */
+                read_extent_buffer(eb, name, (unsigned long)(di + 1),
+                                  name_len);
+                log_di = NULL;
+                /* log_di stays NULL for any other key type */
+                if (dir_key->type == BTRFS_DIR_ITEM_KEY) {
+                        log_di = btrfs_lookup_dir_item(trans, log, log_path,
+                                                       dir_key->objectid,
+                                                       name, name_len, 0);
+                } else if (dir_key->type == BTRFS_DIR_INDEX_KEY) {
+                        log_di = btrfs_lookup_dir_index_item(trans, log,
+                                                     log_path,
+                                                     dir_key->objectid,
+                                                     dir_key->offset,
+                                                     name, name_len, 0);
+                }
+                /*
+                 * NOTE(review): a lookup error (IS_ERR) is handled the
+                 * same way as "name not in the log", i.e. the entry is
+                 * unlinked -- confirm this is intended.
+                 */
+                if (!log_di || IS_ERR(log_di)) {
+                        /* copy out the target key before dropping the path */
+                        btrfs_dir_item_key_to_cpu(eb, di, &location);
+                        btrfs_release_path(root, path);
+                        btrfs_release_path(log, log_path);
+                        inode = read_one_inode(root, location.objectid);
+                        BUG_ON(!inode);
+                        ret = link_to_fixup_dir(trans, root,
+                                                path, location.objectid);
+                        BUG_ON(ret);
+                        /*
+                         * NOTE(review): presumably the extra link keeps the
+                         * unlink below from taking nlink to zero before the
+                         * fixup pass runs -- confirm.
+                         */
+                        btrfs_inc_nlink(inode);
+                        ret = btrfs_unlink_inode(trans, root, dir, inode,
+                                                 name, name_len);
+                        BUG_ON(ret);
+                        kfree(name);
+                        iput(inode);
+                        /* there might still be more names under this key
+                         * check and repeat if required
+                         */
+                        ret = btrfs_search_slot(NULL, root, dir_key, path,
+                                                0, 0);
+                        if (ret == 0)
+                                goto again;
+                        /* key is gone (or the search failed): report success */
+                        ret = 0;
+                        goto out;
+                }
+                /* name is present in the log: keep it and move to the next */
+                btrfs_release_path(log, log_path);
+                kfree(name);
+                ptr = (unsigned long)(di + 1);
+                ptr += name_len;
+        }
+        ret = 0;
+out:
+        btrfs_release_path(root, path);
+        btrfs_release_path(log, log_path);
+        return ret;
+}
+/*
+ * deletion replay happens before we copy any new directory items
+ * out of the log or out of backreferences from inodes.  It
+ * scans the log to find ranges of keys that log is authoritative for,
+ * and then scans the directory to find items in those ranges that are
+ * not present in the log.
+ *
+ * Anything we don't find in the log is unlinked and removed from the
+ * directory.
+ *
+ * Two passes are made over directory 'dirid': first the
+ * BTRFS_DIR_LOG_ITEM_KEY ranges are checked against BTRFS_DIR_ITEM_KEY
+ * items, then the BTRFS_DIR_LOG_INDEX_KEY ranges against
+ * BTRFS_DIR_INDEX_KEY items.  Returns -ENOMEM if the log path cannot be
+ * allocated, a negative value if searching the subvolume fails, and 0
+ * otherwise (including when the directory inode does not exist yet).
+ */
+static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
+                                       struct btrfs_root *root,
+                                       struct btrfs_root *log,
+                                       struct btrfs_path *path,
+                                       u64 dirid)
+{
+        u64 range_start;
+        u64 range_end;
+        int key_type = BTRFS_DIR_LOG_ITEM_KEY;
+        int ret = 0;
+        struct btrfs_key dir_key;
+        struct btrfs_key found_key;
+        struct btrfs_path *log_path;
+        struct inode *dir;
+        dir_key.objectid = dirid;
+        dir_key.type = BTRFS_DIR_ITEM_KEY;
+        log_path = btrfs_alloc_path();
+        if (!log_path)
+                return -ENOMEM;
+        dir = read_one_inode(root, dirid);
+        /* it isn't an error if the inode isn't there, that can happen
+         * because we replay the deletes before we copy in the inode item
+         * from the log
+         */
+        if (!dir) {
+                btrfs_free_path(log_path);
+                return 0;
+        }
+again:
+        /* start each pass from the beginning of the key space */
+        range_start = 0;
+        range_end = 0;
+        while (1) {
+                /* next range of keys the log is authoritative for */
+                ret = find_dir_range(log, path, dirid, key_type,
+                                     &range_start, &range_end);
+                if (ret != 0)
+                        break;
+                dir_key.offset = range_start;
+                /* visit every subvolume dir item that falls in the range */
+                while (1) {
+                        int nritems;
+                        ret = btrfs_search_slot(NULL, root, &dir_key, path,
+                                                0, 0);
+                        if (ret < 0)
+                                goto out;
+                        nritems = btrfs_header_nritems(path->nodes[0]);
+                        if (path->slots[0] >= nritems) {
+                                ret = btrfs_next_leaf(root, path);
+                                if (ret)
+                                        break;
+                        }
+                        btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+                                              path->slots[0]);
+                        /* ran past this directory's items of this type */
+                        if (found_key.objectid != dirid ||
+                            found_key.type != dir_key.type)
+                                goto next_type;
+                        if (found_key.offset > range_end)
+                                break;
+                        ret = check_item_in_log(trans, root, log, path,
+                                                log_path, dir, &found_key);
+                        BUG_ON(ret);
+                        /* avoid wrapping the offset when advancing */
+                        if (found_key.offset == (u64)-1)
+                                break;
+                        dir_key.offset = found_key.offset + 1;
+                }
+                btrfs_release_path(root, path);
+                if (range_end == (u64)-1)
+                        break;
+                range_start = range_end + 1;
+        }
+next_type:
+        /* any non-zero status that ended the scan above is dropped here */
+        ret = 0;
+        if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
+                /* second pass: index ranges against DIR_INDEX items */
+                key_type = BTRFS_DIR_LOG_INDEX_KEY;
+                dir_key.type = BTRFS_DIR_INDEX_KEY;
+                btrfs_release_path(root, path);
+                goto again;
+        }
+out:
+        btrfs_release_path(root, path);
+        btrfs_free_path(log_path);
+        iput(dir);
+        return ret;
+}
+/*
+ * the process_func used to replay items from the log tree.  This
+ * gets called in two different stages.  The first stage just looks
+ * for inodes and makes sure they are all copied into the subvolume.
+ *
+ * The second stage copies all the other item types from the log into
+ * the subvolume.  The two stage approach is slower, but gets rid of
+ * lots of complexity around inodes referencing other inodes that exist
+ * only in the log (references come from either directory items or inode
+ * back refs).
+ *
+ * 'eb' is a block of the log tree 'log' and 'gen' is the generation
+ * passed to btrfs_read_buffer() for it; items are replayed into
+ * wc->replay_dest.  Only leaves (level 0) are processed.  Always
+ * returns 0; any replay failure is a BUG.
+ */
+static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
+                             struct walk_control *wc, u64 gen)
+{
+        int nritems;
+        struct btrfs_path *path;
+        struct btrfs_root *root = wc->replay_dest;
+        struct btrfs_key key;
+        u32 item_size;
+        int level;
+        int i;
+        int ret;
+        /* NOTE(review): the result of btrfs_read_buffer() is not checked */
+        btrfs_read_buffer(eb, gen);
+        level = btrfs_header_level(eb);
+        /* interior nodes hold no items to replay */
+        if (level != 0)
+                return 0;
+        path = btrfs_alloc_path();
+        BUG_ON(!path);
+        nritems = btrfs_header_nritems(eb);
+        for (i = 0; i < nritems; i++) {
+                btrfs_item_key_to_cpu(eb, &key, i);
+                /* NOTE(review): item_size is assigned but never read here */
+                item_size = btrfs_item_size_nr(eb, i);
+                /* inode keys are done during the first stage */
+                if (key.type == BTRFS_INODE_ITEM_KEY &&
+                    wc->stage == LOG_WALK_REPLAY_INODES) {
+                        struct inode *inode;
+                        struct btrfs_inode_item *inode_item;
+                        u32 mode;
+                        inode_item = btrfs_item_ptr(eb, i,
+                                            struct btrfs_inode_item);
+                        mode = btrfs_inode_mode(eb, inode_item);
+                        /* directories: replay deletes before the inode copy */
+                        if (S_ISDIR(mode)) {
+                                ret = replay_dir_deletes(wc->trans,
+                                         root, log, path, key.objectid);
+                                BUG_ON(ret);
+                        }
+                        ret = overwrite_item(wc->trans, root, path,
+                                             eb, i, &key);
+                        BUG_ON(ret);
+                        /* for regular files, truncate away
+                         * extents past the new EOF
+                         */
+                        if (S_ISREG(mode)) {
+                                inode = read_one_inode(root,
+                                                       key.objectid);
+                                BUG_ON(!inode);
+                                ret = btrfs_truncate_inode_items(wc->trans,
+                                        root, inode, inode->i_size,
+                                        BTRFS_EXTENT_DATA_KEY);
+                                BUG_ON(ret);
+                                iput(inode);
+                        }
+                        /* queue the inode for link count fixup */
+                        ret = link_to_fixup_dir(wc->trans, root,
+                                                path, key.objectid);
+                        BUG_ON(ret);
+                }
+                /* everything below only runs in the final replay stage */
+                if (wc->stage < LOG_WALK_REPLAY_ALL)
+                        continue;
+                /* these keys are simply copied */
+                if (key.type == BTRFS_XATTR_ITEM_KEY) {
+                        ret = overwrite_item(wc->trans, root, path,
+                                             eb, i, &key);
+                        BUG_ON(ret);
+                } else if (key.type == BTRFS_INODE_REF_KEY) {
+                        ret = add_inode_ref(wc->trans, root, log, path,
+                                            eb, i, &key);
+                        /* -ENOENT from add_inode_ref() is tolerated */
+                        BUG_ON(ret && ret != -ENOENT);
+                } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
+                        ret = replay_one_extent(wc->trans, root, path,
+                                                eb, i, &key);
+                        BUG_ON(ret);
+                } else if (key.type == BTRFS_DIR_ITEM_KEY ||
+                           key.type == BTRFS_DIR_INDEX_KEY) {
+                        ret = replay_one_dir_item(wc->trans, root, path,
+                                                  eb, i, &key);
+                        BUG_ON(ret);
+                }
+        }
+        btrfs_free_path(path);
+        return 0;
+}
+/*
+ * Walk down the log tree from path->nodes[*level].  Every child block
+ * reached through the current slot is handed to wc->process_func.  At
+ * level 1 the children are handled in place (and freed when wc->free is
+ * set) while the slot advances; above level 1 the walk descends into the
+ * child.  Once the slots of the current node are used up, that node
+ * itself is processed, optionally freed, removed from the path, and
+ * *level is moved up by one.  Always returns 0.
+ */
+static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
+                                   struct btrfs_root *root,
+                                   struct btrfs_path *path, int *level,
+                                   struct walk_control *wc)
+{
+        u64 root_owner;
+        u64 root_gen;
+        u64 bytenr;
+        u64 ptr_gen;
+        struct extent_buffer *next;
+        struct extent_buffer *cur;
+        struct extent_buffer *parent;
+        u32 blocksize;
+        int ret = 0;
+        WARN_ON(*level < 0);
+        WARN_ON(*level >= BTRFS_MAX_LEVEL);
+        while (*level > 0) {
+                WARN_ON(*level < 0);
+                WARN_ON(*level >= BTRFS_MAX_LEVEL);
+                cur = path->nodes[*level];
+                if (btrfs_header_level(cur) != *level)
+                        WARN_ON(1);
+                /* all children of this node have been visited */
+                if (path->slots[*level] >=
+                    btrfs_header_nritems(cur))
+                        break;
+                bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
+                ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
+                blocksize = btrfs_level_size(root, *level - 1);
+                parent = path->nodes[*level];
+                root_owner = btrfs_header_owner(parent);
+                /* NOTE(review): root_gen is assigned but never read */
+                root_gen = btrfs_header_generation(parent);
+                next = btrfs_find_create_tree_block(root, bytenr, blocksize);
+                wc->process_func(root, next, wc, ptr_gen);
+                if (*level == 1) {
+                        /* child is a leaf: finish it here, do not descend */
+                        path->slots[*level]++;
+                        if (wc->free) {
+                                btrfs_read_buffer(next, ptr_gen);
+                                btrfs_tree_lock(next);
+                                clean_tree_block(trans, root, next);
+                                btrfs_wait_tree_block_writeback(next);
+                                btrfs_tree_unlock(next);
+                                ret = btrfs_drop_leaf_ref(trans, root, next);
+                                BUG_ON(ret);
+                                WARN_ON(root_owner !=
+                                        BTRFS_TREE_LOG_OBJECTID);
+                                ret = btrfs_free_reserved_extent(root,
+                                                         bytenr, blocksize);
+                                BUG_ON(ret);
+                        }
+                        free_extent_buffer(next);
+                        continue;
+                }
+                btrfs_read_buffer(next, ptr_gen);
+                WARN_ON(*level <= 0);
+                /* replace whatever the path held one level down */
+                if (path->nodes[*level-1])
+                        free_extent_buffer(path->nodes[*level-1]);
+                path->nodes[*level-1] = next;
+                *level = btrfs_header_level(next);
+                path->slots[*level] = 0;
+                cond_resched();
+        }
+        WARN_ON(*level < 0);
+        WARN_ON(*level >= BTRFS_MAX_LEVEL);
+        /* the root node stands in as its own parent */
+        if (path->nodes[*level] == root->node)
+                parent = path->nodes[*level];
+        else
+                parent = path->nodes[*level + 1];
+        bytenr = path->nodes[*level]->start;
+        blocksize = btrfs_level_size(root, *level);
+        root_owner = btrfs_header_owner(parent);
+        root_gen = btrfs_header_generation(parent);
+        /* now process (and optionally free) the node we stopped on */
+        wc->process_func(root, path->nodes[*level], wc,
+                         btrfs_header_generation(path->nodes[*level]));
+        if (wc->free) {
+                next = path->nodes[*level];
+                btrfs_tree_lock(next);
+                clean_tree_block(trans, root, next);
+                btrfs_wait_tree_block_writeback(next);
+                btrfs_tree_unlock(next);
+                if (*level == 0) {
+                        ret = btrfs_drop_leaf_ref(trans, root, next);
+                        BUG_ON(ret);
+                }
+                WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
+                ret = btrfs_free_reserved_extent(root, bytenr, blocksize);
+                BUG_ON(ret);
+        }
+        free_extent_buffer(path->nodes[*level]);
+        path->nodes[*level] = NULL;
+        *level += 1;
+        cond_resched();
+        return 0;
+}
+/*
+ * Walk back up the log tree starting at *level.  If a node on the way up
+ * still has an unvisited slot, advance to it, store that level in *level
+ * and return 0 so the caller can walk down again.  Otherwise the node at
+ * *level is handed to wc->process_func, freed when wc->free is set,
+ * dropped from the path, and the walk moves one level up.  Returns 1 when
+ * there is nothing left above.
+ */
+static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
+                                 struct btrfs_root *root,
+                                 struct btrfs_path *path, int *level,
+                                 struct walk_control *wc)
+{
+        u64 root_owner;
+        u64 root_gen;
+        int i;
+        int slot;
+        int ret;
+        for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
+                slot = path->slots[i];
+                if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
+                        /* more children here: step to the next one */
+                        struct extent_buffer *node;
+                        node = path->nodes[i];
+                        path->slots[i]++;
+                        *level = i;
+                        WARN_ON(*level == 0);
+                        return 0;
+                } else {
+                        /* this node is exhausted: finish it and go up */
+                        struct extent_buffer *parent;
+                        if (path->nodes[*level] == root->node)
+                                parent = path->nodes[*level];
+                        else
+                                parent = path->nodes[*level + 1];
+                        root_owner = btrfs_header_owner(parent);
+                        /* NOTE(review): root_gen is assigned but never read */
+                        root_gen = btrfs_header_generation(parent);
+                        wc->process_func(root, path->nodes[*level], wc,
+                                 btrfs_header_generation(path->nodes[*level]));
+                        if (wc->free) {
+                                struct extent_buffer *next;
+                                next = path->nodes[*level];
+                                btrfs_tree_lock(next);
+                                clean_tree_block(trans, root, next);
+                                btrfs_wait_tree_block_writeback(next);
+                                btrfs_tree_unlock(next);
+                                if (*level == 0) {
+                                        ret = btrfs_drop_leaf_ref(trans, root,
+                                                                  next);
+                                        BUG_ON(ret);
+                                }
+                                WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
+                                ret = btrfs_free_reserved_extent(root,
+                                                path->nodes[*level]->start,
+                                                path->nodes[*level]->len);
+                                BUG_ON(ret);
+                        }
+                        free_extent_buffer(path->nodes[*level]);
+                        path->nodes[*level] = NULL;
+                        *level = i + 1;
+                }
+        }
+        return 1;
+}
+/*
+ * walk every block of the log tree 'log', handing each one to
+ * wc->process_func.  When wc->free is set, each block is also cleaned,
+ * waited on for writeback and has its reserved extent freed, and the
+ * reference on log->node is dropped at the end.
+ */
+static int walk_log_tree(struct btrfs_trans_handle *trans,
+                         struct btrfs_root *log, struct walk_control *wc)
+{
+        int ret = 0;
+        int wret;
+        int level;
+        struct btrfs_path *path;
+        int i;
+        int orig_level;
+        path = btrfs_alloc_path();
+        BUG_ON(!path);
+        /* seed the path with the root node, holding an extra reference */
+        level = btrfs_header_level(log->node);
+        orig_level = level;
+        path->nodes[level] = log->node;
+        extent_buffer_get(log->node);
+        path->slots[level] = 0;
+        /* alternate walking down and up until walk_up reports the end */
+        while (1) {
+                wret = walk_down_log_tree(trans, log, path, &level, wc);
+                if (wret > 0)
+                        break;
+                if (wret < 0)
+                        ret = wret;
+                wret = walk_up_log_tree(trans, log, path, &level, wc);
+                if (wret > 0)
+                        break;
+                if (wret < 0)
+                        ret = wret;
+        }
+        /* was the root node processed? if not, catch it here */
+        if (path->nodes[orig_level]) {
+                wc->process_func(log, path->nodes[orig_level], wc,
+                         btrfs_header_generation(path->nodes[orig_level]));
+                if (wc->free) {
+                        struct extent_buffer *next;
+                        next = path->nodes[orig_level];
+                        btrfs_tree_lock(next);
+                        clean_tree_block(trans, log, next);
+                        btrfs_wait_tree_block_writeback(next);
+                        btrfs_tree_unlock(next);
+                        if (orig_level == 0) {
+                                ret = btrfs_drop_leaf_ref(trans, log,
+                                                          next);
+                                BUG_ON(ret);
+                        }
+                        WARN_ON(log->root_key.objectid !=
+                                BTRFS_TREE_LOG_OBJECTID);
+                        ret = btrfs_free_reserved_extent(log, next->start,
+                                                         next->len);
+                        BUG_ON(ret);
+                }
+        }
+        /* drop any buffers the walk left in the path */
+        for (i = 0; i <= orig_level; i++) {
+                if (path->nodes[i]) {
+                        free_extent_buffer(path->nodes[i]);
+                        path->nodes[i] = NULL;
+                }
+        }
+        btrfs_free_path(path);
+        if (wc->free)
+                free_extent_buffer(log->node);
+        return ret;
+}
+/*
+ * Sleep until the log commit that is currently in progress has finished.
+ * fs_info->tree_log_mutex is unlocked around the sleep and locked again
+ * afterwards, so the caller must hold it on entry and still holds it on
+ * return.  Always returns 0.
+ */
+static int wait_log_commit(struct btrfs_root *log)
+{
+        DEFINE_WAIT(wait);
+        /* remember which commit we are waiting for */
+        u64 transid = log->fs_info->tree_log_transid;
+        do {
+                prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
+                                TASK_UNINTERRUPTIBLE);
+                mutex_unlock(&log->fs_info->tree_log_mutex);
+                /* only sleep if a commit is still flagged as running */
+                if (atomic_read(&log->fs_info->tree_log_commit))
+                        schedule();
+                finish_wait(&log->fs_info->tree_log_wait, &wait);
+                mutex_lock(&log->fs_info->tree_log_mutex);
+        /* keep waiting while the same commit is still in flight */
+        } while (transid == log->fs_info->tree_log_transid &&
+                atomic_read(&log->fs_info->tree_log_commit));
+        return 0;
+}
+/*
+ * btrfs_sync_log sends a given tree log down to the disk and
+ * updates the super blocks to record it.  When this call is done,
+ * you know that any inodes previously logged are safely on disk
+ *
+ * If another log commit is already running, this just waits for it and
+ * returns.  Always returns 0; write failures are a BUG.
+ */
+int btrfs_sync_log(struct btrfs_trans_handle *trans,
+                   struct btrfs_root *root)
+{
+        int ret;
+        unsigned long batch;
+        struct btrfs_root *log = root->log_root;
+        mutex_lock(&log->fs_info->tree_log_mutex);
+        if (atomic_read(&log->fs_info->tree_log_commit)) {
+                /* someone else is committing the log: wait for them */
+                wait_log_commit(log);
+                goto out;
+        }
+        atomic_set(&log->fs_info->tree_log_commit, 1);
+        /*
+         * wait for current log writers to finish, and repeat until
+         * tree_log_batch stays unchanged across one round
+         */
+        while (1) {
+                batch = log->fs_info->tree_log_batch;
+                mutex_unlock(&log->fs_info->tree_log_mutex);
+                schedule_timeout_uninterruptible(1);
+                mutex_lock(&log->fs_info->tree_log_mutex);
+                while (atomic_read(&log->fs_info->tree_log_writers)) {
+                        DEFINE_WAIT(wait);
+                        prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
+                                        TASK_UNINTERRUPTIBLE);
+                        mutex_unlock(&log->fs_info->tree_log_mutex);
+                        if (atomic_read(&log->fs_info->tree_log_writers))
+                                schedule();
+                        mutex_lock(&log->fs_info->tree_log_mutex);
+                        finish_wait(&log->fs_info->tree_log_wait, &wait);
+                }
+                if (batch == log->fs_info->tree_log_batch)
+                        break;
+        }
+        /* write this log's dirty pages, then those of the log root tree */
+        ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
+        BUG_ON(ret);
+        ret = btrfs_write_and_wait_marked_extents(root->fs_info->log_root_tree,
+                               &root->fs_info->log_root_tree->dirty_log_pages);
+        BUG_ON(ret);
+        /* record where the log root tree lives in the super block copy */
+        btrfs_set_super_log_root(&root->fs_info->super_for_commit,
+                                 log->fs_info->log_root_tree->node->start);
+        btrfs_set_super_log_root_level(&root->fs_info->super_for_commit,
+                       btrfs_header_level(log->fs_info->log_root_tree->node));
+        write_ctree_super(trans, log->fs_info->tree_root, 2);
+        /* commit finished: new transid, reset batch, clear the flag */
+        log->fs_info->tree_log_transid++;
+        log->fs_info->tree_log_batch = 0;
+        atomic_set(&log->fs_info->tree_log_commit, 0);
+        /* order the flag update before checking for sleepers */
+        smp_mb();
+        if (waitqueue_active(&log->fs_info->tree_log_wait))
+                wake_up(&log->fs_info->tree_log_wait);
+out:
+        mutex_unlock(&log->fs_info->tree_log_mutex);
+        return 0;
+}
+/*
+ * free all the extents used by the tree log.  This should be called
+ * at commit time of the full transaction
+ *
+ * Does nothing when the root has no log or while log recovery is
+ * running.  Otherwise the log tree is walked with wc.free set, every
+ * EXTENT_DIRTY range in the log's dirty_log_pages is cleared, the log's
+ * root item is deleted from the tree of log roots and the log root
+ * structure itself is freed.  Always returns 0; failures trip BUG_ON().
+ */
+int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
+{
+        int ret;
+        struct btrfs_root *log;
+        u64 start;
+        u64 end;
+        struct walk_control wc = {
+                .free = 1,
+                .process_func = process_one_buffer
+        };
+        if (!root->log_root || root->fs_info->log_root_recovering)
+                return 0;
+        log = root->log_root;
+        ret = walk_log_tree(trans, log, &wc);
+        BUG_ON(ret);
+        /* forget about every range still marked dirty for this log */
+        while (1) {
+                ret = find_first_extent_bit(&log->dirty_log_pages,
+                                    0, &start, &end, EXTENT_DIRTY);
+                if (ret)
+                        break;
+                clear_extent_dirty(&log->dirty_log_pages,
+                                   start, end, GFP_NOFS);
+        }
+        ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
+                             &log->root_key);
+        BUG_ON(ret);
+        /*
+         * detach the log from the root before freeing it.  We must free
+         * through our local pointer: root->log_root is already NULL here,
+         * and kfree(NULL) would silently leak the structure.
+         */
+        root->log_root = NULL;
+        kfree(log);
+        return 0;
+}
+/*
+ * helper to bring the root item of one subvolume's log up to date in
+ * the tree of log roots.  Nothing is written when the recorded bytenr
+ * already matches the start of the log's current root node.
+ */
+static int update_log_root(struct btrfs_trans_handle *trans,
+                           struct btrfs_root *log)
+{
+        int err = 0;
+        u64 recorded = btrfs_root_bytenr(&log->root_item);
+        if (recorded != log->node->start) {
+                /* the root node moved: refresh bytenr, generation and level */
+                btrfs_set_root_bytenr(&log->root_item, log->node->start);
+                btrfs_set_root_generation(&log->root_item, trans->transid);
+                btrfs_set_root_level(&log->root_item,
+                                     btrfs_header_level(log->node));
+                err = btrfs_update_root(trans, log->fs_info->log_root_tree,
+                                        &log->root_key, &log->root_item);
+                BUG_ON(err);
+        }
+        return err;
+}
+/*
+ * If both a file and directory are logged, and unlinks or renames are
+ * mixed in, we have a few interesting corners:
+ *
+ * create file X in dir Y
+ * link file X to X.link in dir Y
+ * fsync file X
+ * unlink file X but leave X.link
+ * fsync dir Y
+ *
+ * After a crash we would expect only X.link to exist.  But file X
+ * didn't get fsync'd again so the log has back refs for X and X.link.
+ *
+ * We solve this by removing directory entries and inode backrefs from the
+ * log when a file that was logged in the current transaction is
+ * unlinked.  Any later fsync will include the updated log entries, and
+ * we'll be able to reconstruct the proper directory items from backrefs.
+ *
+ * This optimization allows us to avoid relogging the entire inode
+ * or the entire directory.
+ *
+ * 'name'/'name_len' and 'index' identify the entry of directory 'dir'
+ * to remove from the log of 'root'.  Always returns 0.
+ */
+int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
+                                 struct btrfs_root *root,
+                                 const char *name, int name_len,
+                                 struct inode *dir, u64 index)
+{
+        struct btrfs_root *log;
+        struct btrfs_dir_item *di;
+        struct btrfs_path *path;
+        int ret;
+        int bytes_del = 0;
+        /* the directory was not logged in this transaction */
+        if (BTRFS_I(dir)->logged_trans < trans->transid)
+                return 0;
+        ret = join_running_log_trans(root);
+        if (ret)
+                return 0;
+        mutex_lock(&BTRFS_I(dir)->log_mutex);
+        log = root->log_root;
+        path = btrfs_alloc_path();
+        /*
+         * every call below dereferences the path, so fail loudly on an
+         * allocation failure, the same way btrfs_recover_log_trees does
+         */
+        BUG_ON(!path);
+        /* first the entry keyed by name ... */
+        di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
+                                   name, name_len, -1);
+        if (di && !IS_ERR(di)) {
+                ret = btrfs_delete_one_dir_name(trans, log, path, di);
+                bytes_del += name_len;
+                BUG_ON(ret);
+        }
+        btrfs_release_path(log, path);
+        /* ... then the entry keyed by directory index */
+        di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino,
+                                         index, name, name_len, -1);
+        if (di && !IS_ERR(di)) {
+                ret = btrfs_delete_one_dir_name(trans, log, path, di);
+                bytes_del += name_len;
+                BUG_ON(ret);
+        }
+        /* update the directory size in the log to reflect the names
+         * we have removed
+         */
+        if (bytes_del) {
+                struct btrfs_key key;
+                key.objectid = dir->i_ino;
+                key.offset = 0;
+                key.type = BTRFS_INODE_ITEM_KEY;
+                btrfs_release_path(log, path);
+                ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
+                /* only touch the size when the inode item is in the log */
+                if (ret == 0) {
+                        struct btrfs_inode_item *item;
+                        u64 i_size;
+                        item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+                                              struct btrfs_inode_item);
+                        i_size = btrfs_inode_size(path->nodes[0], item);
+                        /* never let the logged size wrap below zero */
+                        if (i_size > bytes_del)
+                                i_size -= bytes_del;
+                        else
+                                i_size = 0;
+                        btrfs_set_inode_size(path->nodes[0], item, i_size);
+                        btrfs_mark_buffer_dirty(path->nodes[0]);
+                }
+                btrfs_release_path(log, path);
+        }
+        btrfs_free_path(path);
+        mutex_unlock(&BTRFS_I(dir)->log_mutex);
+        end_log_trans(root);
+        return 0;
+}
+/*
+ * drop the back reference 'name' (in directory 'dirid') of 'inode' from
+ * the log.  See comments for btrfs_del_dir_entries_in_log.
+ *
+ * Returns 0 without touching the log when the inode was not logged in
+ * this transaction or join_running_log_trans() fails, otherwise the
+ * result of btrfs_del_inode_ref().
+ */
+int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *root,
+                               const char *name, int name_len,
+                               struct inode *inode, u64 dirid)
+{
+        struct btrfs_root *log_tree;
+        u64 ref_index;
+        int err;
+        if (BTRFS_I(inode)->logged_trans < trans->transid)
+                return 0;
+        if (join_running_log_trans(root))
+                return 0;
+        log_tree = root->log_root;
+        mutex_lock(&BTRFS_I(inode)->log_mutex);
+        err = btrfs_del_inode_ref(trans, log_tree, name, name_len,
+                                  inode->i_ino, dirid, &ref_index);
+        mutex_unlock(&BTRFS_I(inode)->log_mutex);
+        end_log_trans(root);
+        return err;
+}
+/*
+ * insert a range item for directory 'dirid' into the log.  The item is
+ * keyed at first_offset and stores last_offset as its end; together they
+ * tell us which part of the key space the log is authoritative for.
+ * BTRFS_DIR_ITEM_KEY ranges use BTRFS_DIR_LOG_ITEM_KEY, every other
+ * key_type uses BTRFS_DIR_LOG_INDEX_KEY.
+ */
+static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
+                                       struct btrfs_root *log,
+                                       struct btrfs_path *path,
+                                       int key_type, u64 dirid,
+                                       u64 first_offset, u64 last_offset)
+{
+        struct btrfs_dir_log_item *range;
+        struct btrfs_key range_key;
+        int err;
+        range_key.objectid = dirid;
+        range_key.offset = first_offset;
+        range_key.type = (key_type == BTRFS_DIR_ITEM_KEY) ?
+                         BTRFS_DIR_LOG_ITEM_KEY : BTRFS_DIR_LOG_INDEX_KEY;
+        err = btrfs_insert_empty_item(trans, log, path, &range_key,
+                                      sizeof(*range));
+        BUG_ON(err);
+        range = btrfs_item_ptr(path->nodes[0], path->slots[0],
+                               struct btrfs_dir_log_item);
+        btrfs_set_dir_log_end(path->nodes[0], range, last_offset);
+        btrfs_mark_buffer_dirty(path->nodes[0]);
+        btrfs_release_path(log, path);
+        return 0;
+}
+/*
+ * log all the items included in the current transaction for a given
+ * directory.  This also creates the range items in the log tree required
+ * to replay anything deleted before the fsync
+ *
+ * key_type selects which kind of directory key is scanned and min_offset
+ * is the key offset the scan starts from.  *last_offset_ret receives the
+ * end of the range written to the log; (u64)-1 means the range runs to
+ * the end of the key space.  Returns 0, or a negative value if the
+ * fallback btrfs_search_slot() fails.
+ */
+static noinline int log_dir_items(struct btrfs_trans_handle *trans,
+                          struct btrfs_root *root, struct inode *inode,
+                          struct btrfs_path *path,
+                          struct btrfs_path *dst_path, int key_type,
+                          u64 min_offset, u64 *last_offset_ret)
+{
+        struct btrfs_key min_key;
+        struct btrfs_key max_key;
+        struct btrfs_root *log = root->log_root;
+        struct extent_buffer *src;
+        int ret;
+        int i;
+        int nritems;
+        u64 first_offset = min_offset;
+        u64 last_offset = (u64)-1;
+        log = root->log_root;
+        max_key.objectid = inode->i_ino;
+        max_key.offset = (u64)-1;
+        max_key.type = key_type;
+        min_key.objectid = inode->i_ino;
+        min_key.type = key_type;
+        min_key.offset = min_offset;
+        path->keep_locks = 1;
+        ret = btrfs_search_forward(root, &min_key, &max_key,
+                                   path, 0, trans->transid);
+        /*
+         * we didn't find anything from this transaction, see if there
+         * is anything at all
+         */
+        if (ret != 0 || min_key.objectid != inode->i_ino ||
+            min_key.type != key_type) {
+                min_key.objectid = inode->i_ino;
+                min_key.type = key_type;
+                min_key.offset = (u64)-1;
+                btrfs_release_path(root, path);
+                ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
+                if (ret < 0) {
+                        btrfs_release_path(root, path);
+                        return ret;
+                }
+                ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
+                /* if ret == 0 there are items for this type,
+                 * create a range to tell us the last key of this type.
+                 * otherwise, there are no items in this directory after
+                 * *min_offset, and we create a range to indicate that.
+                 */
+                if (ret == 0) {
+                        struct btrfs_key tmp;
+                        btrfs_item_key_to_cpu(path->nodes[0], &tmp,
+                                              path->slots[0]);
+                        if (key_type == tmp.type)
+                                first_offset = max(min_offset, tmp.offset) + 1;
+                }
+                goto done;
+        }
+        /* go backward to find any previous key */
+        ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
+        if (ret == 0) {
+                struct btrfs_key tmp;
+                btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
+                if (key_type == tmp.type) {
+                        /* the logged range starts at this older key */
+                        first_offset = tmp.offset;
+                        /*
+                         * NOTE(review): this return value is overwritten
+                         * by the search below without being checked; the
+                         * other overwrite_item calls here BUG_ON failure.
+                         */
+                        ret = overwrite_item(trans, log, dst_path,
+                                             path->nodes[0], path->slots[0],
+                                             &tmp);
+                }
+        }
+        btrfs_release_path(root, path);
+        /* find the first key from this transaction again */
+        ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
+        if (ret != 0) {
+                WARN_ON(1);
+                goto done;
+        }
+        /*
+         * we have a block from this transaction, log every item in it
+         * from our directory
+         */
+        while (1) {
+                struct btrfs_key tmp;
+                src = path->nodes[0];
+                nritems = btrfs_header_nritems(src);
+                for (i = path->slots[0]; i < nritems; i++) {
+                        btrfs_item_key_to_cpu(src, &min_key, i);
+                        /* ran past the items of this directory and type */
+                        if (min_key.objectid != inode->i_ino ||
+                            min_key.type != key_type)
+                                goto done;
+                        ret = overwrite_item(trans, log, dst_path, src, i,
+                                             &min_key);
+                        BUG_ON(ret);
+                }
+                path->slots[0] = nritems;
+                /*
+                 * look ahead to the next item and see if it is also
+                 * from this directory and from this transaction
+                 */
+                ret = btrfs_next_leaf(root, path);
+                if (ret == 1) {
+                        last_offset = (u64)-1;
+                        goto done;
+                }
+                btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
+                if (tmp.objectid != inode->i_ino || tmp.type != key_type) {
+                        last_offset = (u64)-1;
+                        goto done;
+                }
+                /*
+                 * the next leaf has a different generation: log its first
+                 * key and end the range there
+                 */
+                if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
+                        ret = overwrite_item(trans, log, dst_path,
+                                             path->nodes[0], path->slots[0],
+                                             &tmp);
+                        BUG_ON(ret);
+                        last_offset = tmp.offset;
+                        goto done;
+                }
+        }
+done:
+        *last_offset_ret = last_offset;
+        btrfs_release_path(root, path);
+        btrfs_release_path(log, dst_path);
+        /* insert the log range keys to indicate where the log is valid */
+        ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino,
+                                 first_offset, last_offset);
+        BUG_ON(ret);
+        return 0;
+}
+/*
+ * logging a directory works much like logging an inode: every item from
+ * the current transaction is found and written to the log.
+ *
+ * During recovery the directory in the subvolume is scanned, and a key
+ * inside a logged range that is missing from the log tree means that dir
+ * entry was unlinked during the transaction.
+ *
+ * For that scan to work, the log must also hold one key smaller than the
+ * smallest and one key larger than the largest key logged by this
+ * transaction.
+ *
+ * The whole key space is covered twice, first for BTRFS_DIR_ITEM_KEY and
+ * then for BTRFS_DIR_INDEX_KEY.
+ */
+static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
+                          struct btrfs_root *root, struct inode *inode,
+                          struct btrfs_path *path,
+                          struct btrfs_path *dst_path)
+{
+        int key_type = BTRFS_DIR_ITEM_KEY;
+        int err;
+        for (;;) {
+                u64 range_start = 0;
+                u64 range_end = 0;
+                /* log range after range until one reaches (u64)-1 */
+                do {
+                        err = log_dir_items(trans, root, inode, path,
+                                            dst_path, key_type, range_start,
+                                            &range_end);
+                        BUG_ON(err);
+                        range_start = range_end + 1;
+                } while (range_end != (u64)-1);
+                if (key_type != BTRFS_DIR_ITEM_KEY)
+                        break;
+                key_type = BTRFS_DIR_INDEX_KEY;
+        }
+        return 0;
+}
+/*
+ * helper to remove the items of 'objectid' from the log before the inode
+ * is logged again.  Items are deleted one at a time, starting from the
+ * slot just before the key (objectid, max_key_type, (u64)-1) and working
+ * down.  This cannot be run for file data extents because it does not
+ * free the extents they point to.
+ */
+static int drop_objectid_items(struct btrfs_trans_handle *trans,
+                                  struct btrfs_root *log,
+                                  struct btrfs_path *path,
+                                  u64 objectid, int max_key_type)
+{
+        struct btrfs_key search_key;
+        struct btrfs_key victim;
+        int err;
+        search_key.objectid = objectid;
+        search_key.type = max_key_type;
+        search_key.offset = (u64)-1;
+        for (;;) {
+                /* anything other than 'not found' ends the loop */
+                if (btrfs_search_slot(trans, log, &search_key, path,
+                                      -1, 1) != 1)
+                        break;
+                if (path->slots[0] == 0)
+                        break;
+                path->slots[0]--;
+                btrfs_item_key_to_cpu(path->nodes[0], &victim,
+                                      path->slots[0]);
+                if (victim.objectid != objectid)
+                        break;
+                err = btrfs_del_item(trans, log, path);
+                BUG_ON(err);
+                btrfs_release_path(log, path);
+        }
+        btrfs_release_path(log, path);
+        return 0;
+}
+/*
+ * copy 'nr' consecutive items, starting at 'start_slot' of the extent
+ * buffer 'src', into the log tree.
+ *
+ * When inode_only is LOG_INODE_EXISTS, the size and generation of copied
+ * inode items are zeroed in the log.  For regular and prealloc file
+ * extents that are not holes, an extra extent reference is taken and the
+ * checksums covering the extent are looked up and added to the log once
+ * all items have been copied.
+ *
+ * Returns 0 on success or -ENOMEM if the temporary key/size arrays
+ * cannot be allocated; other failures trip BUG_ON().
+ */
+static noinline int copy_items(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *log,
+                               struct btrfs_path *dst_path,
+                               struct extent_buffer *src,
+                               int start_slot, int nr, int inode_only)
+{
+        unsigned long src_offset;
+        unsigned long dst_offset;
+        struct btrfs_file_extent_item *extent;
+        struct btrfs_inode_item *inode_item;
+        int ret;
+        struct btrfs_key *ins_keys;
+        u32 *ins_sizes;
+        char *ins_data;
+        int i;
+        struct list_head ordered_sums;
+        INIT_LIST_HEAD(&ordered_sums);
+        /* one allocation: nr item sizes followed by nr keys */
+        ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
+                           nr * sizeof(u32), GFP_NOFS);
+        /* nothing has been touched yet, so we can simply bail out */
+        if (!ins_data)
+                return -ENOMEM;
+        ins_sizes = (u32 *)ins_data;
+        ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
+        for (i = 0; i < nr; i++) {
+                ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
+                btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
+        }
+        ret = btrfs_insert_empty_items(trans, log, dst_path,
+                                       ins_keys, ins_sizes, nr);
+        BUG_ON(ret);
+        for (i = 0; i < nr; i++) {
+                dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
+                                                   dst_path->slots[0]);
+                src_offset = btrfs_item_ptr_offset(src, start_slot + i);
+                copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
+                                   src_offset, ins_sizes[i]);
+                if (inode_only == LOG_INODE_EXISTS &&
+                    ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
+                        inode_item = btrfs_item_ptr(dst_path->nodes[0],
+                                                    dst_path->slots[0],
+                                                    struct btrfs_inode_item);
+                        btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0);
+                        /* set the generation to zero so the recover code
+                         * can tell the difference between a logging
+                         * just to say 'this inode exists' and a logging
+                         * to say 'update this inode with these values'
+                         */
+                        btrfs_set_inode_generation(dst_path->nodes[0],
+                                                   inode_item, 0);
+                }
+                /* take a reference on file data extents so that truncates
+                 * or deletes of this inode don't have to relog the inode
+                 * again
+                 */
+                if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) {
+                        int found_type;
+                        extent = btrfs_item_ptr(src, start_slot + i,
+                                                struct btrfs_file_extent_item);
+                        found_type = btrfs_file_extent_type(src, extent);
+                        if (found_type == BTRFS_FILE_EXTENT_REG ||
+                            found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+                                u64 ds = btrfs_file_extent_disk_bytenr(src,
+                                                                   extent);
+                                u64 dl = btrfs_file_extent_disk_num_bytes(src,
+                                                                      extent);
+                                u64 cs = btrfs_file_extent_offset(src, extent);
+                                u64 cl = btrfs_file_extent_num_bytes(src,
+                                                                     extent);
+                                /* compressed: cover the whole disk extent */
+                                if (btrfs_file_extent_compression(src,
+                                                                  extent)) {
+                                        cs = 0;
+                                        cl = dl;
+                                }
+                                /* ds == 0 is a hole */
+                                if (ds != 0) {
+                                        ret = btrfs_inc_extent_ref(trans, log,
+                                                   ds, dl,
+                                                   dst_path->nodes[0]->start,
+                                                   BTRFS_TREE_LOG_OBJECTID,
+                                                   trans->transid,
+                                                   ins_keys[i].objectid);
+                                        BUG_ON(ret);
+                                        ret = btrfs_lookup_csums_range(
+                                                   log->fs_info->csum_root,
+                                                   ds + cs, ds + cs + cl - 1,
+                                                   &ordered_sums);
+                                        BUG_ON(ret);
+                                }
+                        }
+                }
+                dst_path->slots[0]++;
+        }
+        btrfs_mark_buffer_dirty(dst_path->nodes[0]);
+        btrfs_release_path(log, dst_path);
+        kfree(ins_data);
+        /*
+         * we have to do this after the loop above to avoid changing the
+         * log tree while trying to change the log tree.
+         */
+        while (!list_empty(&ordered_sums)) {
+                struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
+                                                   struct btrfs_ordered_sum,
+                                                   list);
+                ret = btrfs_csum_file_blocks(trans, log, sums);
+                BUG_ON(ret);
+                list_del(&sums->list);
+                kfree(sums);
+        }
+        return 0;
+}
+/* log a single inode in the tree log.
+ * At least one parent directory for this inode must exist in the tree
+ * or be logged already.
+ *
+ * Any items from this inode changed by the current transaction are copied
+ * to the log tree.  An extra reference is taken on any extents in this
+ * file, allowing us to avoid a whole pile of corner cases around logging
+ * blocks that have been removed from the tree.
+ *
+ * See LOG_INODE_ALL and related defines for a description of what inode_only
+ * does.
+ *
+ * This handles both files and directories.
+ *
+ * Always returns 0; failures (including path allocation) trip BUG_ON().
+ */
+static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
+                             struct btrfs_root *root, struct inode *inode,
+                             int inode_only)
+{
+        struct btrfs_path *path;
+        struct btrfs_path *dst_path;
+        struct btrfs_key min_key;
+        struct btrfs_key max_key;
+        struct btrfs_root *log = root->log_root;
+        struct extent_buffer *src = NULL;
+        int ret;
+        int nritems;
+        int ins_start_slot = 0;
+        int ins_nr;
+        path = btrfs_alloc_path();
+        dst_path = btrfs_alloc_path();
+        /*
+         * both paths are dereferenced unconditionally below, so fail
+         * loudly here, the same way btrfs_recover_log_trees does
+         */
+        BUG_ON(!path);
+        BUG_ON(!dst_path);
+        min_key.objectid = inode->i_ino;
+        min_key.type = BTRFS_INODE_ITEM_KEY;
+        min_key.offset = 0;
+        max_key.objectid = inode->i_ino;
+        /* inode_only logging and directories stop at the xattr items */
+        if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
+                max_key.type = BTRFS_XATTR_ITEM_KEY;
+        else
+                max_key.type = (u8)-1;
+        max_key.offset = (u64)-1;
+        /*
+         * if this inode has already been logged and we're in inode_only
+         * mode, we don't want to delete the things that have already
+         * been written to the log.
+         *
+         * But, if the inode has been through an inode_only log,
+         * the logged_trans field is not set.  This allows us to catch
+         * any new names for this inode in the backrefs by logging it
+         * again
+         */
+        if (inode_only == LOG_INODE_EXISTS &&
+            BTRFS_I(inode)->logged_trans == trans->transid) {
+                btrfs_free_path(path);
+                btrfs_free_path(dst_path);
+                goto out;
+        }
+        mutex_lock(&BTRFS_I(inode)->log_mutex);
+        /*
+         * a brute force approach to making sure we get the most uptodate
+         * copies of everything.
+         */
+        if (S_ISDIR(inode->i_mode)) {
+                int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
+                if (inode_only == LOG_INODE_EXISTS)
+                        max_key_type = BTRFS_XATTR_ITEM_KEY;
+                ret = drop_objectid_items(trans, log, path,
+                                          inode->i_ino, max_key_type);
+        } else {
+                ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
+        }
+        BUG_ON(ret);
+        path->keep_locks = 1;
+        /*
+         * walk the items found by btrfs_search_forward and copy them to
+         * the log.  Runs of adjacent slots in one leaf are batched into a
+         * single copy_items call: ins_start_slot is the first slot of the
+         * pending run and ins_nr its length.
+         */
+        while (1) {
+                ins_nr = 0;
+                ret = btrfs_search_forward(root, &min_key, &max_key,
+                                           path, 0, trans->transid);
+                if (ret != 0)
+                        break;
+again:
+                /* note, ins_nr might be > 0 here, cleanup outside the loop */
+                if (min_key.objectid != inode->i_ino)
+                        break;
+                if (min_key.type > max_key.type)
+                        break;
+                src = path->nodes[0];
+                if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
+                        /* this slot extends the pending run */
+                        ins_nr++;
+                        goto next_slot;
+                } else if (!ins_nr) {
+                        /* start a new run at this slot */
+                        ins_start_slot = path->slots[0];
+                        ins_nr = 1;
+                        goto next_slot;
+                }
+                /* not adjacent: flush the pending run, then start over */
+                ret = copy_items(trans, log, dst_path, src, ins_start_slot,
+                                 ins_nr, inode_only);
+                BUG_ON(ret);
+                ins_nr = 1;
+                ins_start_slot = path->slots[0];
+next_slot:
+                nritems = btrfs_header_nritems(path->nodes[0]);
+                path->slots[0]++;
+                if (path->slots[0] < nritems) {
+                        btrfs_item_key_to_cpu(path->nodes[0], &min_key,
+                                              path->slots[0]);
+                        goto again;
+                }
+                /* end of this leaf: flush before the path is released */
+                if (ins_nr) {
+                        ret = copy_items(trans, log, dst_path, src,
+                                         ins_start_slot,
+                                         ins_nr, inode_only);
+                        BUG_ON(ret);
+                        ins_nr = 0;
+                }
+                btrfs_release_path(root, path);
+                /* step min_key to the next possible key for the search */
+                if (min_key.offset < (u64)-1)
+                        min_key.offset++;
+                else if (min_key.type < (u8)-1)
+                        min_key.type++;
+                else if (min_key.objectid < (u64)-1)
+                        min_key.objectid++;
+                else
+                        break;
+        }
+        /* flush a run left pending by one of the breaks above */
+        if (ins_nr) {
+                ret = copy_items(trans, log, dst_path, src,
+                                 ins_start_slot,
+                                 ins_nr, inode_only);
+                BUG_ON(ret);
+                ins_nr = 0;
+        }
+        WARN_ON(ins_nr);
+        if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
+                btrfs_release_path(root, path);
+                btrfs_release_path(log, dst_path);
+                BTRFS_I(inode)->log_dirty_trans = 0;
+                ret = log_directory_changes(trans, root, inode, path, dst_path);
+                BUG_ON(ret);
+        }
+        BTRFS_I(inode)->logged_trans = trans->transid;
+        mutex_unlock(&BTRFS_I(inode)->log_mutex);
+        btrfs_free_path(path);
+        btrfs_free_path(dst_path);
+        mutex_lock(&root->fs_info->tree_log_mutex);
+        ret = update_log_root(trans, log);
+        BUG_ON(ret);
+        mutex_unlock(&root->fs_info->tree_log_mutex);
+out:
+        return 0;
+}
+/*
+ * log one inode: __btrfs_log_inode bracketed by start_log_trans and
+ * end_log_trans.  Returns whatever __btrfs_log_inode returns.
+ */
+int btrfs_log_inode(struct btrfs_trans_handle *trans,
+                    struct btrfs_root *root, struct inode *inode,
+                    int inode_only)
+{
+        int err;
+        start_log_trans(trans, root);
+        err = __btrfs_log_inode(trans, root, inode, inode_only);
+        end_log_trans(root);
+        return err;
+}
+/*
+ * wrapper around __btrfs_log_inode that makes sure newly created parent
+ * directories also end up in the log.  The dentry's own inode is logged
+ * with LOG_INODE_ALL; each parent gets a LOG_INODE_EXISTS logging.  The
+ * walk up stops at a missing parent or parent inode, at a parent on a
+ * different super block, or at a parent whose generation is not newer
+ * than the last committed transaction.
+ */
+int btrfs_log_dentry(struct btrfs_trans_handle *trans,
+                    struct btrfs_root *root, struct dentry *dentry)
+{
+        struct super_block *sb;
+        int mode = LOG_INODE_ALL;
+        int err;
+        start_log_trans(trans, root);
+        sb = dentry->d_inode->i_sb;
+        do {
+                err = __btrfs_log_inode(trans, root, dentry->d_inode, mode);
+                BUG_ON(err);
+                /* everything above the first inode is logged minimally */
+                mode = LOG_INODE_EXISTS;
+                dentry = dentry->d_parent;
+        } while (dentry && dentry->d_inode &&
+                 dentry->d_inode->i_sb == sb &&
+                 BTRFS_I(dentry->d_inode)->generation >
+                 root->fs_info->last_trans_committed);
+        end_log_trans(root);
+        return 0;
+}
+/*
+ * logging a dentry is not safe once the chunk root has added new chunks.
+ * Returns 1 without logging anything when last_trans_new_blockgroup is
+ * newer than the last committed transaction; the caller must then commit
+ * the transaction to get its data safely on disk.  Otherwise the dentry
+ * is logged and the result of btrfs_log_dentry is returned.
+ */
+int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
+                          struct btrfs_root *root, struct dentry *dentry)
+{
+        if (root->fs_info->last_trans_new_blockgroup >
+            root->fs_info->last_trans_committed)
+                return 1;
+        return btrfs_log_dentry(trans, root, dentry);
+}
+/*
+ * should be called during mount to replay any log trees found in the FS
+ *
+ * The log roots in log_root_tree are walked several times, from the
+ * highest key offset down: once to pin (wc.pin), once at stage
+ * LOG_WALK_REPLAY_INODES, and then once per later stage up to
+ * LOG_WALK_REPLAY_ALL.  The transaction is committed at the end and
+ * log_root_tree is freed.  Always returns 0.
+ */
+int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
+{
+        int ret;
+        struct btrfs_path *path;
+        struct btrfs_trans_handle *trans;
+        struct btrfs_key key;
+        struct btrfs_key found_key;
+        struct btrfs_key tmp_key;
+        struct btrfs_root *log;
+        struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
+        u64 highest_inode;
+        struct walk_control wc = {
+                .process_func = process_one_buffer,
+                .stage = 0,
+        };
+        fs_info->log_root_recovering = 1;
+        path = btrfs_alloc_path();
+        BUG_ON(!path);
+        /* NOTE(review): the result of btrfs_start_transaction is not checked */
+        trans = btrfs_start_transaction(fs_info->tree_root, 1);
+        wc.trans = trans;
+        wc.pin = 1;
+        /* NOTE(review): unlike the per-log walks below, this return value is ignored */
+        walk_log_tree(trans, log_root_tree, &wc);
+again:
+        /* start each pass from the highest possible log root key */
+        key.objectid = BTRFS_TREE_LOG_OBJECTID;
+        key.offset = (u64)-1;
+        btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+        while (1) {
+                ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
+                if (ret < 0)
+                        break;
+                /* no exact match: step back to the previous slot */
+                if (ret > 0) {
+                        if (path->slots[0] == 0)
+                                break;
+                        path->slots[0]--;
+                }
+                btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+                                      path->slots[0]);
+                btrfs_release_path(log_root_tree, path);
+                if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
+                        break;
+                log = btrfs_read_fs_root_no_radix(log_root_tree,
+                                                  &found_key);
+                BUG_ON(!log);
+                /* found_key.offset is used as the objectid of the root to replay into */
+                tmp_key.objectid = found_key.offset;
+                tmp_key.type = BTRFS_ROOT_ITEM_KEY;
+                tmp_key.offset = (u64)-1;
+                wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
+                BUG_ON(!wc.replay_dest);
+                wc.replay_dest->log_root = log;
+                btrfs_record_root_in_trans(wc.replay_dest);
+                ret = walk_log_tree(trans, log, &wc);
+                BUG_ON(ret);
+                if (wc.stage == LOG_WALK_REPLAY_ALL) {
+                        ret = fixup_inode_link_counts(trans, wc.replay_dest,
+                                                      path);
+                        BUG_ON(ret);
+                }
+                ret = btrfs_find_highest_inode(wc.replay_dest, &highest_inode);
+                if (ret == 0) {
+                        wc.replay_dest->highest_inode = highest_inode;
+                        wc.replay_dest->last_inode_alloc = highest_inode;
+                }
+                /* continue with the log root just below this one */
+                key.offset = found_key.offset - 1;
+                wc.replay_dest->log_root = NULL;
+                free_extent_buffer(log->node);
+                kfree(log);
+                if (found_key.offset == 0)
+                        break;
+        }
+        btrfs_release_path(log_root_tree, path);
+        /* step one is to pin it all, step two is to replay just inodes */
+        if (wc.pin) {
+                wc.pin = 0;
+                wc.process_func = replay_one_buffer;
+                wc.stage = LOG_WALK_REPLAY_INODES;
+                goto again;
+        }
+        /* step three is to replay everything */
+        if (wc.stage < LOG_WALK_REPLAY_ALL) {
+                wc.stage++;
+                goto again;
+        }
+        btrfs_free_path(path);
+        free_extent_buffer(log_root_tree->node);
+        log_root_tree->log_root = NULL;
+        fs_info->log_root_recovering = 0;
+        /* step 4: commit the transaction, which also unpins the blocks */
+        btrfs_commit_transaction(trans, fs_info->tree_root);
+        kfree(log_root_tree);
+        return 0;
+}
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
new file mode 100644
index 000000000000..b9409b32ed02
--- /dev/null
+++ b/fs/btrfs/tree-log.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#ifndef __TREE_LOG_
+#define __TREE_LOG_
+/*
+ * Prototypes for the btrfs tree-log entry points.  Only declarations live
+ * in this header; every function listed here returns an int.
+ */
+int btrfs_sync_log(struct btrfs_trans_handle *trans,
+                   struct btrfs_root *root);
+int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
+int btrfs_log_dentry(struct btrfs_trans_handle *trans,
+                    struct btrfs_root *root, struct dentry *dentry);
+int btrfs_recover_log_trees(struct btrfs_root *tree_root);
+int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
+                          struct btrfs_root *root, struct dentry *dentry);
+int btrfs_log_inode(struct btrfs_trans_handle *trans,
+                    struct btrfs_root *root, struct inode *inode,
+                    int inode_only);
+int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
+                                 struct btrfs_root *root,
+                                 const char *name, int name_len,
+                                 struct inode *dir, u64 index);
+int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *root,
+                               const char *name, int name_len,
+                               struct inode *inode, u64 dirid);
+#endif /* __TREE_LOG_ */
diff --git a/fs/btrfs/version.h b/fs/btrfs/version.h
new file mode 100644
index 000000000000..9bf3946d5ef2
--- /dev/null
+++ b/fs/btrfs/version.h
@@ -0,0 +1,4 @@
+#ifndef __BTRFS_VERSION_H
+#define __BTRFS_VERSION_H
+/*
+ * Default build version string.
+ * NOTE(review): version.sh appears to overwrite this file with a
+ * git-derived string - confirm against the build rules.
+ */
+#define BTRFS_BUILD_VERSION "Btrfs"
+#endif
diff --git a/fs/btrfs/version.sh b/fs/btrfs/version.sh
new file mode 100644
index 000000000000..1ca1952fd917
--- /dev/null
+++ b/fs/btrfs/version.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+#
+# determine-version -- report a useful version for releases
+#
+# Copyright 2008, Aron Griffis <agriffis@n01se.net>
+# Copyright 2008, Oracle
+# Released under the GNU GPLv2
+#
+# Writes a BTRFS_BUILD_VERSION header to .build-version.h and installs it
+# as version.h only when it differs from the version.h already present.
+
+v="v0.16"
+
+# Only ask git for a version when the tool exists and "git branch" works.
+if which git &> /dev/null && git branch >& /dev/null; then
+    if head=$(git rev-parse --verify HEAD 2>/dev/null); then
+        if tag=$(git describe --tags 2>/dev/null); then
+            v="$tag"
+        fi
+        # Are there uncommitted changes?
+        git update-index --refresh --unmerged > /dev/null
+        if git diff-index --name-only HEAD |
+            grep -v "^scripts/package" |
+            read dummy; then
+            v="${v}-dirty"
+        fi
+    fi
+fi
+
+{
+    echo "#ifndef __BUILD_VERSION"
+    echo "#define __BUILD_VERSION"
+    echo "#define BTRFS_BUILD_VERSION \"Btrfs $v\""
+    echo "#endif"
+} > .build-version.h
+
+# Keep the existing version.h when the generated header is identical.
+if diff -q version.h .build-version.h >& /dev/null; then
+    rm .build-version.h
+    exit 0
+fi
+mv .build-version.h version.h
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
new file mode 100644
index 000000000000..3451e1cca2b5
--- /dev/null
+++ b/fs/btrfs/volumes.c
@@ -0,0 +1,3219 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#include <linux/sched.h>
+#include <linux/bio.h>
+#include <linux/buffer_head.h>
+#include <linux/blkdev.h>
+#include <linux/random.h>
+#include <linux/version.h>
+#include <asm/div64.h>
+#include "compat.h"
+#include "ctree.h"
+#include "extent_map.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "print-tree.h"
+#include "volumes.h"
+#include "async-thread.h"
+/*
+ * Chunk mapping record with a trailing flexible array of stripes.
+ * Allocation size for n stripes is given by map_lookup_size(n).
+ * NOTE(review): stripes[] presumably holds num_stripes entries - confirm
+ * against the code that fills this structure in.
+ */
+struct map_lookup {
+        u64 type;
+        int io_align;
+        int io_width;
+        int stripe_len;
+        int sector_size;
+        int num_stripes;
+        int sub_stripes;
+        struct btrfs_bio_stripe stripes[];
+};
+/* forward declarations of file-local helpers */
+static int init_first_rw_device(struct btrfs_trans_handle *trans,
+                                struct btrfs_root *root,
+                                struct btrfs_device *device);
+static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
+/* bytes needed for a struct map_lookup carrying n stripes */
+#define map_lookup_size(n) (sizeof(struct map_lookup) + \
+                            (sizeof(struct btrfs_bio_stripe) * (n)))
+/* taken by btrfs_lock_volumes() and by the device scan/open/close paths */
+static DEFINE_MUTEX(uuid_mutex);
+/* list of struct btrfs_fs_devices, linked through their ->list member */
+static LIST_HEAD(fs_uuids);
+/* acquire the file-wide uuid_mutex */
+void btrfs_lock_volumes(void)
+{
+        mutex_lock(&uuid_mutex);
+}
+/* release the file-wide uuid_mutex */
+void btrfs_unlock_volumes(void)
+{
+        mutex_unlock(&uuid_mutex);
+}
+/* acquire the chunk_mutex of the fs_info that owns root */
+static void lock_chunks(struct btrfs_root *root)
+{
+        mutex_lock(&root->fs_info->chunk_mutex);
+}
+/* release the chunk_mutex of the fs_info that owns root */
+static void unlock_chunks(struct btrfs_root *root)
+{
+        mutex_unlock(&root->fs_info->chunk_mutex);
+}
+/*
+ * Free an fs_devices structure together with every btrfs_device (and its
+ * name string) still linked on its device list.  Warns if the structure
+ * is still marked as opened.
+ */
+static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
+{
+        struct btrfs_device *dev;
+        struct btrfs_device *next;
+
+        WARN_ON(fs_devices->opened);
+        list_for_each_entry_safe(dev, next, &fs_devices->devices, dev_list) {
+                list_del(&dev->dev_list);
+                kfree(dev->name);
+                kfree(dev);
+        }
+        kfree(fs_devices);
+}
+/*
+ * Unlink and free every fs_devices entry on the fs_uuids list.
+ * Always returns 0.
+ */
+int btrfs_cleanup_fs_uuids(void)
+{
+        struct btrfs_fs_devices *entry;
+        struct btrfs_fs_devices *next;
+
+        list_for_each_entry_safe(entry, next, &fs_uuids, list) {
+                list_del(&entry->list);
+                free_fs_devices(entry);
+        }
+        return 0;
+}
+/*
+ * Return the first device on head whose devid matches and, when uuid is
+ * non-NULL, whose uuid matches as well.  Returns NULL if none is found.
+ */
+static noinline struct btrfs_device *__find_device(struct list_head *head,
+                                                   u64 devid, u8 *uuid)
+{
+        struct btrfs_device *candidate;
+
+        list_for_each_entry(candidate, head, dev_list) {
+                if (candidate->devid != devid)
+                        continue;
+                if (uuid && memcmp(candidate->uuid, uuid, BTRFS_UUID_SIZE))
+                        continue;
+                return candidate;
+        }
+        return NULL;
+}
+/*
+ * Look up the fs_devices entry on fs_uuids whose fsid equals the given
+ * one.  Returns NULL when there is no such entry.
+ */
+static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
+{
+        struct btrfs_fs_devices *entry;
+
+        list_for_each_entry(entry, &fs_uuids, list) {
+                if (!memcmp(fsid, entry->fsid, BTRFS_FSID_SIZE))
+                        return entry;
+        }
+        return NULL;
+}
+/*
+ * we try to collect pending bios for a device so we don't get a large
+ * number of procs sending bios down to the same device.  This greatly
+ * improves the scheduler's ability to collect and merge the bios.
+ *
+ * But, it also turns into a long list of bios to process and that is sure
+ * to eventually make the worker thread block.  The solution here is to
+ * make some progress and then put this work struct back at the end of
+ * the list if the block device is congested.  This way, multiple devices
+ * can make progress from a single worker thread.
+ *
+ * Always returns 0.
+ */
+static noinline int run_scheduled_bios(struct btrfs_device *device)
+{
+        struct bio *pending;
+        struct backing_dev_info *bdi;
+        struct btrfs_fs_info *fs_info;
+        struct bio *tail;
+        struct bio *cur;
+        int again = 0;
+        unsigned long num_run = 0;
+        unsigned long limit;
+        bdi = device->bdev->bd_inode->i_mapping->backing_dev_info;
+        fs_info = device->dev_root->fs_info;
+        /*
+         * waiters on async_submit_wait are woken once nr_async_bios drops
+         * below two thirds of the async submit limit
+         */
+        limit = btrfs_async_submit_limit(fs_info);
+        limit = limit * 2 / 3;
+loop:
+        spin_lock(&device->io_lock);
+        /* take all the bios off the list at once and process them
+         * later on (without the lock held).  But, remember the
+         * tail and other pointers so the bios can be properly reinserted
+         * into the list if we hit congestion
+         */
+        pending = device->pending_bios;
+        tail = device->pending_bio_tail;
+        WARN_ON(pending && !tail);
+        device->pending_bios = NULL;
+        device->pending_bio_tail = NULL;
+        /*
+         * if pending was null this time around, no bios need processing
+         * at all and we can stop.  Otherwise it'll loop back up again
+         * and do an additional check so no bios are missed.
+         *
+         * device->running_pending is used to synchronize with the
+         * schedule_bio code.
+         */
+        if (pending) {
+                again = 1;
+                device->running_pending = 1;
+        } else {
+                again = 0;
+                device->running_pending = 0;
+        }
+        spin_unlock(&device->io_lock);
+        while (pending) {
+                /* unlink the head bio from our private list */
+                cur = pending;
+                pending = pending->bi_next;
+                cur->bi_next = NULL;
+                atomic_dec(&fs_info->nr_async_bios);
+                if (atomic_read(&fs_info->nr_async_bios) < limit &&
+                    waitqueue_active(&fs_info->async_submit_wait))
+                        wake_up(&fs_info->async_submit_wait);
+                BUG_ON(atomic_read(&cur->bi_cnt) == 0);
+                /* hold an extra reference for the duration of submit_bio() */
+                bio_get(cur);
+                submit_bio(cur->bi_rw, cur);
+                bio_put(cur);
+                num_run++;
+                /*
+                 * we made progress, there is more work to do and the bdi
+                 * is now congested.  Back off and let other work structs
+                 * run instead
+                 */
+                if (pending && bdi_write_congested(bdi) &&
+                    fs_info->fs_devices->open_devices > 1) {
+                        struct bio *old_head;
+                        /*
+                         * put the unsubmitted remainder back at the front
+                         * of the device list.  If bios were queued while
+                         * we ran (pending_bio_tail is set) they are chained
+                         * after our tail, otherwise our tail becomes the
+                         * tail of the device list.
+                         */
+                        spin_lock(&device->io_lock);
+                        old_head = device->pending_bios;
+                        device->pending_bios = pending;
+                        if (device->pending_bio_tail)
+                                tail->bi_next = old_head;
+                        else
+                                device->pending_bio_tail = tail;
+                        device->running_pending = 0;
+                        spin_unlock(&device->io_lock);
+                        btrfs_requeue_work(&device->work);
+                        goto done;
+                }
+        }
+        if (again)
+                goto loop;
+done:
+        return 0;
+}
+/*
+ * Work callback: recover the btrfs_device embedding this work item and
+ * run its scheduled bios.
+ */
+static void pending_bios_fn(struct btrfs_work *work)
+{
+        struct btrfs_device *owner =
+                container_of(work, struct btrfs_device, work);
+
+        run_scheduled_bios(owner);
+}
+/*
+ * Record a scanned device under the fs_devices entry that matches the
+ * super block's fsid, creating the entry and/or the device when they are
+ * not known yet.
+ *
+ * On success stores the fs_devices in *fs_devices_ret and returns 0.
+ * Returns -ENOMEM on allocation failure, or -EBUSY when an unknown device
+ * shows up for an fs_devices that is already opened.
+ */
+static noinline int device_list_add(const char *path,
+                           struct btrfs_super_block *disk_super,
+                           u64 devid, struct btrfs_fs_devices **fs_devices_ret)
+{
+        u64 found_transid = btrfs_super_generation(disk_super);
+        struct btrfs_fs_devices *fs_devices = find_fsid(disk_super->fsid);
+        struct btrfs_device *device = NULL;
+
+        if (fs_devices) {
+                device = __find_device(&fs_devices->devices, devid,
+                                       disk_super->dev_item.uuid);
+        } else {
+                /* first device seen for this fsid: start a new entry */
+                fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
+                if (!fs_devices)
+                        return -ENOMEM;
+                INIT_LIST_HEAD(&fs_devices->devices);
+                INIT_LIST_HEAD(&fs_devices->alloc_list);
+                list_add(&fs_devices->list, &fs_uuids);
+                memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
+                fs_devices->latest_devid = devid;
+                fs_devices->latest_trans = found_transid;
+        }
+
+        if (device == NULL) {
+                if (fs_devices->opened)
+                        return -EBUSY;
+
+                /* we can safely leave the fs_devices entry around */
+                device = kzalloc(sizeof(*device), GFP_NOFS);
+                if (!device)
+                        return -ENOMEM;
+
+                device->name = kstrdup(path, GFP_NOFS);
+                if (!device->name) {
+                        kfree(device);
+                        return -ENOMEM;
+                }
+
+                device->devid = devid;
+                device->work.func = pending_bios_fn;
+                device->barriers = 1;
+                memcpy(device->uuid, disk_super->dev_item.uuid,
+                       BTRFS_UUID_SIZE);
+                spin_lock_init(&device->io_lock);
+                INIT_LIST_HEAD(&device->dev_alloc_list);
+
+                list_add(&device->dev_list, &fs_devices->devices);
+                device->fs_devices = fs_devices;
+                fs_devices->num_devices++;
+        }
+
+        /* remember which device carries the newest generation */
+        if (found_transid > fs_devices->latest_trans) {
+                fs_devices->latest_devid = devid;
+                fs_devices->latest_trans = found_transid;
+        }
+
+        *fs_devices_ret = fs_devices;
+        return 0;
+}
+/*
+ * Build a new fs_devices that mirrors orig: same fsid, latest_devid and
+ * latest_trans, and one freshly allocated device per device on
+ * orig->devices carrying a copy of its name, devid and uuid.
+ *
+ * Returns the new structure, or ERR_PTR(-ENOMEM) if any allocation
+ * fails, in which case everything allocated so far is freed.
+ */
+static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
+{
+        struct btrfs_fs_devices *fs_devices;
+        struct btrfs_device *device;
+        struct btrfs_device *orig_dev;
+
+        fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
+        if (!fs_devices)
+                return ERR_PTR(-ENOMEM);
+        INIT_LIST_HEAD(&fs_devices->devices);
+        INIT_LIST_HEAD(&fs_devices->alloc_list);
+        INIT_LIST_HEAD(&fs_devices->list);
+        fs_devices->latest_devid = orig->latest_devid;
+        fs_devices->latest_trans = orig->latest_trans;
+        memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));
+        list_for_each_entry(orig_dev, &orig->devices, dev_list) {
+                device = kzalloc(sizeof(*device), GFP_NOFS);
+                if (!device)
+                        goto error;
+                device->name = kstrdup(orig_dev->name, GFP_NOFS);
+                if (!device->name) {
+                        /*
+                         * the device is not on fs_devices->devices yet, so
+                         * free_fs_devices() below would not release it
+                         */
+                        kfree(device);
+                        goto error;
+                }
+                device->devid = orig_dev->devid;
+                device->work.func = pending_bios_fn;
+                memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
+                device->barriers = 1;
+                spin_lock_init(&device->io_lock);
+                INIT_LIST_HEAD(&device->dev_list);
+                INIT_LIST_HEAD(&device->dev_alloc_list);
+                list_add(&device->dev_list, &fs_devices->devices);
+                device->fs_devices = fs_devices;
+                fs_devices->num_devices++;
+        }
+        return fs_devices;
+error:
+        free_fs_devices(fs_devices);
+        return ERR_PTR(-ENOMEM);
+}
+/*
+ * Drop every device that is not flagged in_fs_metadata from fs_devices
+ * and from each fs_devices reachable through the ->seed chain: close its
+ * block device, take it off the allocation list and free it.
+ * Runs under uuid_mutex and always returns 0.
+ */
+int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
+{
+        struct btrfs_device *dev;
+        struct btrfs_device *next;
+
+        mutex_lock(&uuid_mutex);
+        do {
+                list_for_each_entry_safe(dev, next, &fs_devices->devices,
+                                         dev_list) {
+                        if (dev->in_fs_metadata)
+                                continue;
+
+                        if (dev->bdev) {
+                                close_bdev_exclusive(dev->bdev, dev->mode);
+                                dev->bdev = NULL;
+                                fs_devices->open_devices--;
+                        }
+                        if (dev->writeable) {
+                                list_del_init(&dev->dev_alloc_list);
+                                dev->writeable = 0;
+                                fs_devices->rw_devices--;
+                        }
+
+                        list_del_init(&dev->dev_list);
+                        fs_devices->num_devices--;
+                        kfree(dev->name);
+                        kfree(dev);
+                }
+                /* continue with the seed filesystem, if there is one */
+                fs_devices = fs_devices->seed;
+        } while (fs_devices);
+        mutex_unlock(&uuid_mutex);
+
+        return 0;
+}
+/*
+ * Drop one open reference on fs_devices.  When the count reaches zero (or
+ * below), close every open block device, empty the allocation list and
+ * reset the per-device and per-set open state.  Always returns 0.
+ */
+static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
+{
+        struct btrfs_device *dev;
+
+        fs_devices->opened--;
+        if (fs_devices->opened > 0)
+                return 0;
+
+        list_for_each_entry(dev, &fs_devices->devices, dev_list) {
+                if (dev->bdev) {
+                        close_bdev_exclusive(dev->bdev, dev->mode);
+                        fs_devices->open_devices--;
+                }
+                if (dev->writeable) {
+                        list_del_init(&dev->dev_alloc_list);
+                        fs_devices->rw_devices--;
+                }
+
+                dev->bdev = NULL;
+                dev->writeable = 0;
+                dev->in_fs_metadata = 0;
+        }
+
+        /* both counters should have dropped back to zero by now */
+        WARN_ON(fs_devices->open_devices);
+        WARN_ON(fs_devices->rw_devices);
+        fs_devices->opened = 0;
+        fs_devices->seeding = 0;
+        return 0;
+}
+/*
+ * Close fs_devices under uuid_mutex.  If that dropped the last open
+ * reference, detach its chain of seed fs_devices and close and free each
+ * of them outside the mutex.  Returns the result of closing fs_devices.
+ */
+int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
+{
+        struct btrfs_fs_devices *seed_devices = NULL;
+        int ret;
+
+        mutex_lock(&uuid_mutex);
+        ret = __btrfs_close_devices(fs_devices);
+        if (!fs_devices->opened) {
+                seed_devices = fs_devices->seed;
+                fs_devices->seed = NULL;
+        }
+        mutex_unlock(&uuid_mutex);
+
+        while (seed_devices != NULL) {
+                struct btrfs_fs_devices *seed = seed_devices;
+
+                /* read the link first, the entry is freed below */
+                seed_devices = seed->seed;
+                __btrfs_close_devices(seed);
+                free_fs_devices(seed);
+        }
+        return ret;
+}
+/*
+ * Open every named, not yet opened device on fs_devices->devices and
+ * check that the devid and uuid in its super block match the list entry.
+ * Devices that cannot be opened, read or validated are skipped.
+ *
+ * While walking, tracks the device with the highest super block
+ * generation and whether all opened devices carry the SEEDING flag.
+ *
+ * Returns 0 if at least one device is open afterwards, -EIO otherwise.
+ */
+static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
+                                fmode_t flags, void *holder)
+{
+        struct block_device *bdev;
+        struct list_head *head = &fs_devices->devices;
+        struct list_head *cur;
+        struct btrfs_device *device;
+        struct block_device *latest_bdev = NULL;
+        struct buffer_head *bh;
+        struct btrfs_super_block *disk_super;
+        u64 latest_devid = 0;
+        u64 latest_transid = 0;
+        u64 devid;
+        int seeding = 1;
+        int ret = 0;
+
+        list_for_each(cur, head) {
+                device = list_entry(cur, struct btrfs_device, dev_list);
+                if (device->bdev)
+                        continue;
+                if (!device->name)
+                        continue;
+                bdev = open_bdev_exclusive(device->name, flags, holder);
+                if (IS_ERR(bdev)) {
+                        printk(KERN_INFO "open %s failed\n", device->name);
+                        goto error;
+                }
+                set_blocksize(bdev, 4096);
+                bh = btrfs_read_dev_super(bdev);
+                if (!bh)
+                        goto error_close;
+                disk_super = (struct btrfs_super_block *)bh->b_data;
+                devid = le64_to_cpu(disk_super->dev_item.devid);
+                if (devid != device->devid)
+                        goto error_brelse;
+                if (memcmp(device->uuid, disk_super->dev_item.uuid,
+                           BTRFS_UUID_SIZE))
+                        goto error_brelse;
+                device->generation = btrfs_super_generation(disk_super);
+                if (!latest_transid || device->generation > latest_transid) {
+                        latest_devid = devid;
+                        latest_transid = device->generation;
+                        latest_bdev = bdev;
+                }
+                if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
+                        device->writeable = 0;
+                } else {
+                        device->writeable = !bdev_read_only(bdev);
+                        seeding = 0;
+                }
+                device->bdev = bdev;
+                device->in_fs_metadata = 0;
+                device->mode = flags;
+                fs_devices->open_devices++;
+                if (device->writeable) {
+                        fs_devices->rw_devices++;
+                        list_add(&device->dev_alloc_list,
+                                 &fs_devices->alloc_list);
+                }
+                /* the super block copy is not used past this point */
+                brelse(bh);
+                continue;
+error_brelse:
+                brelse(bh);
+error_close:
+                /* close with the same mode the device was opened with */
+                close_bdev_exclusive(bdev, flags);
+error:
+                continue;
+        }
+        if (fs_devices->open_devices == 0) {
+                ret = -EIO;
+                goto out;
+        }
+        fs_devices->seeding = seeding;
+        fs_devices->opened = 1;
+        fs_devices->latest_bdev = latest_bdev;
+        fs_devices->latest_devid = latest_devid;
+        fs_devices->latest_trans = latest_transid;
+        fs_devices->total_rw_bytes = 0;
+out:
+        return ret;
+}
+/*
+ * Open the devices of fs_devices under uuid_mutex.  If the set is already
+ * opened only its open count is bumped; otherwise the devices are really
+ * opened.  Returns 0 or the error from __btrfs_open_devices().
+ */
+int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
+                       fmode_t flags, void *holder)
+{
+        int ret = 0;
+
+        mutex_lock(&uuid_mutex);
+        if (!fs_devices->opened)
+                ret = __btrfs_open_devices(fs_devices, flags, holder);
+        else
+                fs_devices->opened++;
+        mutex_unlock(&uuid_mutex);
+
+        return ret;
+}
+/*
+ * Read the btrfs super block of the device at path, log what was found
+ * and register the device through device_list_add().  The block device
+ * is only held open for the duration of the scan; uuid_mutex is held
+ * throughout.
+ *
+ * Returns 0 on success with *fs_devices_ret set by device_list_add(),
+ * the error from opening the device or setting its block size, -EIO when
+ * no super block can be read, or the error from device_list_add().
+ */
+int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
+                          struct btrfs_fs_devices **fs_devices_ret)
+{
+        struct btrfs_super_block *disk_super;
+        struct block_device *bdev;
+        struct buffer_head *bh;
+        int ret;
+        u64 devid;
+        u64 transid;
+
+        mutex_lock(&uuid_mutex);
+        bdev = open_bdev_exclusive(path, flags, holder);
+        if (IS_ERR(bdev)) {
+                ret = PTR_ERR(bdev);
+                goto error;
+        }
+        ret = set_blocksize(bdev, 4096);
+        if (ret)
+                goto error_close;
+        bh = btrfs_read_dev_super(bdev);
+        if (!bh) {
+                ret = -EIO;
+                goto error_close;
+        }
+        disk_super = (struct btrfs_super_block *)bh->b_data;
+        devid = le64_to_cpu(disk_super->dev_item.devid);
+        transid = btrfs_super_generation(disk_super);
+        if (disk_super->label[0])
+                /*
+                 * the label comes straight from disk and may lack a
+                 * terminating NUL, so bound the print to the field size
+                 */
+                printk(KERN_INFO "device label %.*s ",
+                       (int)sizeof(disk_super->label), disk_super->label);
+        else {
+                /* FIXME, make a real uuid parser */
+                printk(KERN_INFO "device fsid %llx-%llx ",
+                       *(unsigned long long *)disk_super->fsid,
+                       *(unsigned long long *)(disk_super->fsid + 8));
+        }
+        /* continues the line started by one of the printks above */
+        printk(KERN_CONT "devid %llu transid %llu %s\n",
+               (unsigned long long)devid, (unsigned long long)transid, path);
+        ret = device_list_add(path, disk_super, devid, fs_devices_ret);
+        brelse(bh);
+error_close:
+        close_bdev_exclusive(bdev, flags);
+error:
+        mutex_unlock(&uuid_mutex);
+        return ret;
+}
+/*
+ * this uses a pretty simple search, the expectation is that it is
+ * called very infrequently and that a given device has a small number
+ * of extents
+ *
+ * Walks the items keyed by device->devid in device->dev_root looking for
+ * a range of at least num_bytes that is not covered by a dev extent,
+ * either between two extents or after the last one.
+ *
+ * Returns 0 with the range start stored in *start, -ENOMEM if no path
+ * can be allocated, -ENOSPC if no suitable range ends at or before
+ * device->total_bytes, or a negative error from the tree search helpers.
+ */
+static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
+                                         struct btrfs_device *device,
+                                         u64 num_bytes, u64 *start)
+{
+        struct btrfs_key key;
+        struct btrfs_root *root = device->dev_root;
+        struct btrfs_dev_extent *dev_extent = NULL;
+        struct btrfs_path *path;
+        u64 hole_size = 0;
+        u64 last_byte = 0;
+        u64 search_start = 0;
+        u64 search_end = device->total_bytes;
+        int ret;
+        int slot = 0;
+        int start_found;
+        struct extent_buffer *l;
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+        path->reada = 2;
+        start_found = 0;
+        /* FIXME use last free of some kind */
+        /* we don't want to overwrite the superblock on the drive,
+         * so we make sure to start at an offset of at least 1MB
+         */
+        search_start = max((u64)1024 * 1024, search_start);
+        /* honour fs_info->alloc_start only if the request still fits */
+        if (root->fs_info->alloc_start + num_bytes <= device->total_bytes)
+                search_start = max(root->fs_info->alloc_start, search_start);
+        key.objectid = device->devid;
+        key.offset = search_start;
+        key.type = BTRFS_DEV_EXTENT_KEY;
+        ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
+        if (ret < 0)
+                goto error;
+        ret = btrfs_previous_item(root, path, 0, key.type);
+        if (ret < 0)
+                goto error;
+        l = path->nodes[0];
+        btrfs_item_key_to_cpu(l, &key, path->slots[0]);
+        while (1) {
+                l = path->nodes[0];
+                slot = path->slots[0];
+                if (slot >= btrfs_header_nritems(l)) {
+                        ret = btrfs_next_leaf(root, path);
+                        if (ret == 0)
+                                continue;
+                        if (ret < 0)
+                                goto error;
+                        /*
+                         * reached when the tree has no further leaves, or
+                         * (via goto) when the walk moved past this devid
+                         */
+no_more_items:
+                        if (!start_found) {
+                                if (search_start >= search_end) {
+                                        ret = -ENOSPC;
+                                        goto error;
+                                }
+                                *start = search_start;
+                                start_found = 1;
+                                goto check_pending;
+                        }
+                        /* allocate after the end of the last extent seen */
+                        *start = last_byte > search_start ?
+                                last_byte : search_start;
+                        if (search_end <= *start) {
+                                ret = -ENOSPC;
+                                goto error;
+                        }
+                        goto check_pending;
+                }
+                btrfs_item_key_to_cpu(l, &key, slot);
+                if (key.objectid < device->devid)
+                        goto next;
+                if (key.objectid > device->devid)
+                        goto no_more_items;
+                /*
+                 * once at least one extent has been seen, check the hole
+                 * between the end of the previous extent (clamped up to
+                 * search_start) and the start of this item
+                 */
+                if (key.offset >= search_start && key.offset > last_byte &&
+                    start_found) {
+                        if (last_byte < search_start)
+                                last_byte = search_start;
+                        hole_size = key.offset - last_byte;
+                        if (key.offset > last_byte &&
+                            hole_size >= num_bytes) {
+                                *start = last_byte;
+                                goto check_pending;
+                        }
+                }
+                if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
+                        goto next;
+                /* remember where this dev extent ends */
+                start_found = 1;
+                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
+                last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent);
+next:
+                path->slots[0]++;
+                cond_resched();
+        }
+check_pending:
+        /* we have to make sure we didn't find an extent that has already
+         * been allocated by the map tree or the original allocation
+         */
+        BUG_ON(*start < search_start);
+        if (*start + num_bytes > search_end) {
+                ret = -ENOSPC;
+                goto error;
+        }
+        /* check for pending inserts here */
+        ret = 0;
+error:
+        btrfs_free_path(path);
+        return ret;
+}
+/*
+ * Delete the dev extent item of device that starts at, or covers, the
+ * byte offset start, and subtract its length from device->bytes_used
+ * when that counter is non-zero.
+ *
+ * Returns -ENOMEM if no path can be allocated and 0 otherwise; every
+ * other failure trips a BUG_ON().
+ */
+static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
+                          struct btrfs_device *device,
+                          u64 start)
+{
+        struct btrfs_root *root = device->dev_root;
+        struct extent_buffer *leaf = NULL;
+        struct btrfs_dev_extent *extent = NULL;
+        struct btrfs_key found_key;
+        struct btrfs_key key;
+        struct btrfs_path *path;
+        int ret;
+
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+
+        key.objectid = device->devid;
+        key.type = BTRFS_DEV_EXTENT_KEY;
+        key.offset = start;
+
+        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+        if (ret > 0) {
+                u64 extent_end;
+
+                /* no exact match: the previous item has to cover start */
+                ret = btrfs_previous_item(root, path, key.objectid,
+                                          BTRFS_DEV_EXTENT_KEY);
+                BUG_ON(ret);
+                leaf = path->nodes[0];
+                extent = btrfs_item_ptr(leaf, path->slots[0],
+                                        struct btrfs_dev_extent);
+                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+                extent_end = found_key.offset +
+                             btrfs_dev_extent_length(leaf, extent);
+                BUG_ON(found_key.offset > start || extent_end < start);
+                ret = 0;
+        } else if (!ret) {
+                leaf = path->nodes[0];
+                extent = btrfs_item_ptr(leaf, path->slots[0],
+                                        struct btrfs_dev_extent);
+        }
+        /* a negative result from the search ends up here */
+        BUG_ON(ret);
+
+        if (device->bytes_used > 0)
+                device->bytes_used -= btrfs_dev_extent_length(leaf, extent);
+
+        ret = btrfs_del_item(trans, root, path);
+        BUG_ON(ret);
+
+        btrfs_free_path(path);
+        return ret;
+}
+/*
+ * Insert a dev extent item for device at byte offset start, num_bytes
+ * long, and fill in the chunk tree, chunk objectid, chunk offset and
+ * chunk tree uuid it refers back to.  Warns if the device is not flagged
+ * in_fs_metadata.
+ *
+ * Returns -ENOMEM if no path can be allocated and 0 otherwise; a failed
+ * insertion trips a BUG_ON().
+ */
+int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
+                           struct btrfs_device *device,
+                           u64 chunk_tree, u64 chunk_objectid,
+                           u64 chunk_offset, u64 start, u64 num_bytes)
+{
+        struct btrfs_root *root = device->dev_root;
+        struct btrfs_dev_extent *item;
+        struct extent_buffer *eb;
+        struct btrfs_path *path;
+        struct btrfs_key key;
+        int ret;
+
+        WARN_ON(!device->in_fs_metadata);
+
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+
+        key.objectid = device->devid;
+        key.type = BTRFS_DEV_EXTENT_KEY;
+        key.offset = start;
+
+        ret = btrfs_insert_empty_item(trans, root, path, &key,
+                                      sizeof(*item));
+        BUG_ON(ret);
+
+        eb = path->nodes[0];
+        item = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_extent);
+
+        btrfs_set_dev_extent_chunk_tree(eb, item, chunk_tree);
+        btrfs_set_dev_extent_chunk_objectid(eb, item, chunk_objectid);
+        btrfs_set_dev_extent_chunk_offset(eb, item, chunk_offset);
+        write_extent_buffer(eb, root->fs_info->chunk_tree_uuid,
+                    (unsigned long)btrfs_dev_extent_chunk_tree_uuid(item),
+                    BTRFS_UUID_SIZE);
+        btrfs_set_dev_extent_length(eb, item, num_bytes);
+        btrfs_mark_buffer_dirty(eb);
+
+        btrfs_free_path(path);
+        return ret;
+}
+/*
+ * Find the logical offset at which the next chunk for @objectid can start.
+ *
+ * Searches @root for the last BTRFS_CHUNK_ITEM_KEY item.  If that item
+ * belongs to @objectid, *offset is set to its key offset plus the chunk
+ * length; if there is no previous chunk item, or it belongs to another
+ * objectid, *offset is set to 0.
+ *
+ * Returns 0 on success, -ENOMEM if no path can be allocated, or the
+ * negative error returned by btrfs_search_slot().
+ */
+static noinline int find_next_chunk(struct btrfs_root *root,
+                                    u64 objectid, u64 *offset)
+{
+        struct btrfs_path *path;
+        int ret;
+        struct btrfs_key key;
+        struct btrfs_chunk *chunk;
+        struct btrfs_key found_key;
+        path = btrfs_alloc_path();
+        /*
+         * report allocation failure to the caller like the other helpers
+         * in this file do, instead of crashing the machine
+         */
+        if (!path)
+                return -ENOMEM;
+        key.objectid = objectid;
+        key.offset = (u64)-1;
+        key.type = BTRFS_CHUNK_ITEM_KEY;
+        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+        if (ret < 0)
+                goto error;
+        /* an item with offset (u64)-1 is not expected to exist */
+        BUG_ON(ret == 0);
+        ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
+        if (ret) {
+                *offset = 0;
+        } else {
+                btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+                                      path->slots[0]);
+                if (found_key.objectid != objectid)
+                        *offset = 0;
+                else {
+                        chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
+                                               struct btrfs_chunk);
+                        *offset = found_key.offset +
+                                btrfs_chunk_length(path->nodes[0], chunk);
+                }
+        }
+        ret = 0;
+error:
+        btrfs_free_path(path);
+        return ret;
+}
+/*
+ * Pick the next unused device id: one more than the offset of the last
+ * BTRFS_DEV_ITEM_KEY item in the chunk root, or 1 when no such item is
+ * found.
+ *
+ * Returns 0 on success, -ENOMEM if no path can be allocated, or the
+ * negative error returned by btrfs_search_slot().
+ */
+static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid)
+{
+        struct btrfs_root *chunk_root = root->fs_info->chunk_root;
+        struct btrfs_key search = {
+                .objectid = BTRFS_DEV_ITEMS_OBJECTID,
+                .type = BTRFS_DEV_ITEM_KEY,
+                .offset = (u64)-1,
+        };
+        struct btrfs_key last;
+        struct btrfs_path *path;
+        int err;
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+        err = btrfs_search_slot(NULL, chunk_root, &search, path, 0, 0);
+        if (err < 0)
+                goto out;
+        BUG_ON(err == 0);
+        if (btrfs_previous_item(chunk_root, path, BTRFS_DEV_ITEMS_OBJECTID,
+                                BTRFS_DEV_ITEM_KEY) == 0) {
+                btrfs_item_key_to_cpu(path->nodes[0], &last,
+                                      path->slots[0]);
+                *objectid = last.offset + 1;
+        } else {
+                /* no device item found */
+                *objectid = 1;
+        }
+        err = 0;
+out:
+        btrfs_free_path(path);
+        return err;
+}
+/*
+ * Write the device item for @device into the chunk root, keyed by
+ * (BTRFS_DEV_ITEMS_OBJECTID, BTRFS_DEV_ITEM_KEY, devid).
+ *
+ * The btrfs_device struct should be fully filled in by the caller; its
+ * devid, type, io geometry, sizes and uuid are copied into the item,
+ * together with the filesystem's fsid.  Generation, group, seek speed,
+ * bandwidth and start offset are written as zero.
+ *
+ * Returns 0 on success, -ENOMEM if no path can be allocated, or the error
+ * from btrfs_insert_empty_item().
+ */
+int btrfs_add_device(struct btrfs_trans_handle *trans,
+                     struct btrfs_root *root,
+                     struct btrfs_device *device)
+{
+        struct btrfs_root *chunk_root = root->fs_info->chunk_root;
+        struct btrfs_key key = {
+                .objectid = BTRFS_DEV_ITEMS_OBJECTID,
+                .type = BTRFS_DEV_ITEM_KEY,
+                .offset = device->devid,
+        };
+        struct btrfs_dev_item *item;
+        struct extent_buffer *eb;
+        struct btrfs_path *path;
+        int err;
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+        err = btrfs_insert_empty_item(trans, chunk_root, path, &key,
+                                      sizeof(*item));
+        if (err)
+                goto free_path;
+        eb = path->nodes[0];
+        item = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_item);
+        /* values taken from the in-memory device */
+        btrfs_set_device_id(eb, item, device->devid);
+        btrfs_set_device_generation(eb, item, 0);
+        btrfs_set_device_type(eb, item, device->type);
+        btrfs_set_device_io_align(eb, item, device->io_align);
+        btrfs_set_device_io_width(eb, item, device->io_width);
+        btrfs_set_device_sector_size(eb, item, device->sector_size);
+        btrfs_set_device_total_bytes(eb, item, device->total_bytes);
+        btrfs_set_device_bytes_used(eb, item, device->bytes_used);
+        /* fields that are simply zeroed here */
+        btrfs_set_device_group(eb, item, 0);
+        btrfs_set_device_seek_speed(eb, item, 0);
+        btrfs_set_device_bandwidth(eb, item, 0);
+        btrfs_set_device_start_offset(eb, item, 0);
+        write_extent_buffer(eb, device->uuid,
+                            (unsigned long)btrfs_device_uuid(item),
+                            BTRFS_UUID_SIZE);
+        write_extent_buffer(eb, chunk_root->fs_info->fsid,
+                            (unsigned long)btrfs_device_fsid(item),
+                            BTRFS_UUID_SIZE);
+        btrfs_mark_buffer_dirty(eb);
+        err = 0;
+free_path:
+        btrfs_free_path(path);
+        return err;
+}
+/*
+ * Delete the device item of @device from the chunk root.  The work is done
+ * in its own transaction, under the chunk lock, and the transaction is
+ * committed before returning regardless of the outcome.
+ *
+ * Returns 0 on success, -ENOMEM if no path can be allocated, -ENOENT if
+ * the item does not exist, or the error from the search/delete.
+ */
+static int btrfs_rm_dev_item(struct btrfs_root *root,
+                             struct btrfs_device *device)
+{
+        struct btrfs_root *chunk_root = root->fs_info->chunk_root;
+        struct btrfs_trans_handle *trans;
+        struct btrfs_path *path;
+        struct btrfs_key key;
+        int err;
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+        trans = btrfs_start_transaction(chunk_root, 1);
+        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+        key.type = BTRFS_DEV_ITEM_KEY;
+        key.offset = device->devid;
+        lock_chunks(chunk_root);
+        err = btrfs_search_slot(trans, chunk_root, &key, path, -1, 1);
+        /* a positive result means the exact key was not found */
+        if (err > 0)
+                err = -ENOENT;
+        if (!err)
+                err = btrfs_del_item(trans, chunk_root, path);
+        btrfs_free_path(path);
+        unlock_chunks(chunk_root);
+        btrfs_commit_transaction(trans, chunk_root);
+        return err;
+}
+/*
+ * Remove a device from the filesystem.
+ *
+ * @device_path is either the path of a block device or the literal string
+ * "missing", which selects the first device that is part of the fs
+ * metadata but has no block device attached.
+ *
+ * The removal is refused with -EINVAL when it would take a raid10
+ * filesystem to fewer than four, or a raid1 filesystem to fewer than two,
+ * writeable devices, or when it would remove the only writeable device.
+ * Otherwise the device is shrunk to zero, its device item is deleted, the
+ * in-memory bookkeeping and the super block copy are updated and, for a
+ * writeable device whose super block was read, the on-disk magic is
+ * cleared.
+ *
+ * Runs with uuid_mutex and fs_info->volume_mutex held for its duration.
+ *
+ * Returns 0 on success; -ENOENT if no matching device exists (including
+ * the "missing" case); -EIO if the device's super block cannot be read;
+ * or the error from opening, shrinking or deleting the device item.
+ */
+int btrfs_rm_device(struct btrfs_root *root, char *device_path)
+{
+        struct btrfs_device *device;
+        struct btrfs_device *next_device;
+        struct block_device *bdev;
+        struct buffer_head *bh = NULL;
+        struct btrfs_super_block *disk_super;
+        u64 all_avail;
+        u64 devid;
+        u64 num_devices;
+        u8 *dev_uuid;
+        int ret = 0;
+        mutex_lock(&uuid_mutex);
+        mutex_lock(&root->fs_info->volume_mutex);
+        all_avail = root->fs_info->avail_data_alloc_bits |
+                root->fs_info->avail_system_alloc_bits |
+                root->fs_info->avail_metadata_alloc_bits;
+        if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
+            root->fs_info->fs_devices->rw_devices <= 4) {
+                printk(KERN_ERR "btrfs: unable to go below four devices "
+                       "on raid10\n");
+                ret = -EINVAL;
+                goto out;
+        }
+        if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
+            root->fs_info->fs_devices->rw_devices <= 2) {
+                printk(KERN_ERR "btrfs: unable to go below two "
+                       "devices on raid1\n");
+                ret = -EINVAL;
+                goto out;
+        }
+        if (strcmp(device_path, "missing") == 0) {
+                struct list_head *cur;
+                struct list_head *devices;
+                struct btrfs_device *tmp;
+                device = NULL;
+                devices = &root->fs_info->fs_devices->devices;
+                list_for_each(cur, devices) {
+                        tmp = list_entry(cur, struct btrfs_device, dev_list);
+                        if (tmp->in_fs_metadata && !tmp->bdev) {
+                                device = tmp;
+                                break;
+                        }
+                }
+                bdev = NULL;
+                bh = NULL;
+                disk_super = NULL;
+                if (!device) {
+                        printk(KERN_ERR "btrfs: no missing devices found to "
+                               "remove\n");
+                        /* nothing was removed, so do not report success */
+                        ret = -ENOENT;
+                        goto out;
+                }
+        } else {
+                bdev = open_bdev_exclusive(device_path, FMODE_READ,
+                                      root->fs_info->bdev_holder);
+                if (IS_ERR(bdev)) {
+                        ret = PTR_ERR(bdev);
+                        goto out;
+                }
+                set_blocksize(bdev, 4096);
+                bh = btrfs_read_dev_super(bdev);
+                if (!bh) {
+                        ret = -EIO;
+                        goto error_close;
+                }
+                disk_super = (struct btrfs_super_block *)bh->b_data;
+                devid = le64_to_cpu(disk_super->dev_item.devid);
+                dev_uuid = disk_super->dev_item.uuid;
+                device = btrfs_find_device(root, devid, dev_uuid,
+                                           disk_super->fsid);
+                if (!device) {
+                        ret = -ENOENT;
+                        goto error_brelse;
+                }
+        }
+        if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
+                printk(KERN_ERR "btrfs: unable to remove the only writeable "
+                       "device\n");
+                ret = -EINVAL;
+                goto error_brelse;
+        }
+        /*
+         * take the device off the allocation list before shrinking it;
+         * error_undo puts it back if the removal fails
+         */
+        if (device->writeable) {
+                list_del_init(&device->dev_alloc_list);
+                root->fs_info->fs_devices->rw_devices--;
+        }
+        ret = btrfs_shrink_device(device, 0);
+        if (ret)
+                goto error_undo;
+        ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
+        if (ret)
+                goto error_undo;
+        device->in_fs_metadata = 0;
+        list_del_init(&device->dev_list);
+        device->fs_devices->num_devices--;
+        /* hand over bdev references that pointed at the removed device */
+        next_device = list_entry(root->fs_info->fs_devices->devices.next,
+                                 struct btrfs_device, dev_list);
+        if (device->bdev == root->fs_info->sb->s_bdev)
+                root->fs_info->sb->s_bdev = next_device->bdev;
+        if (device->bdev == root->fs_info->fs_devices->latest_bdev)
+                root->fs_info->fs_devices->latest_bdev = next_device->bdev;
+        if (device->bdev) {
+                close_bdev_exclusive(device->bdev, device->mode);
+                device->bdev = NULL;
+                device->fs_devices->open_devices--;
+        }
+        num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
+        btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices);
+        if (device->fs_devices->open_devices == 0) {
+                /*
+                 * the device's fs_devices has no open devices left: unlink
+                 * it from the seed chain and release it
+                 */
+                struct btrfs_fs_devices *fs_devices;
+                fs_devices = root->fs_info->fs_devices;
+                while (fs_devices) {
+                        if (fs_devices->seed == device->fs_devices)
+                                break;
+                        fs_devices = fs_devices->seed;
+                }
+                fs_devices->seed = device->fs_devices->seed;
+                device->fs_devices->seed = NULL;
+                __btrfs_close_devices(device->fs_devices);
+                free_fs_devices(device->fs_devices);
+        }
+        /*
+         * at this point, the device is zero sized.  We want to
+         * remove it from the devices list and zero out the old super
+         */
+        if (device->writeable && disk_super) {
+                /* make sure this device isn't detected as part of
+                 * the FS anymore.  disk_super is NULL when the device
+                 * was selected via "missing", in which case there is
+                 * no super block buffer to clear.
+                 */
+                memset(&disk_super->magic, 0, sizeof(disk_super->magic));
+                set_buffer_dirty(bh);
+                sync_dirty_buffer(bh);
+        }
+        kfree(device->name);
+        kfree(device);
+        ret = 0;
+error_brelse:
+        brelse(bh);
+error_close:
+        if (bdev)
+                close_bdev_exclusive(bdev, FMODE_READ);
+out:
+        mutex_unlock(&root->fs_info->volume_mutex);
+        mutex_unlock(&uuid_mutex);
+        return ret;
+error_undo:
+        /* removal failed: make the device allocatable again */
+        if (device->writeable) {
+                list_add(&device->dev_alloc_list,
+                         &root->fs_info->fs_devices->alloc_list);
+                root->fs_info->fs_devices->rw_devices++;
+        }
+        goto error_brelse;
+}
+/*
+ * does all the dirty work required for changing file system's UUID.
+ *
+ * The current (seeding) fs_devices is turned into the new, initially empty
+ * set of devices with a freshly generated fsid, while its devices are
+ * moved to a newly allocated seed_devices that is linked in through
+ * fs_devices->seed.  A clone of the original fs_devices is added to
+ * fs_uuids.
+ *
+ * The caller must hold uuid_mutex.  @trans is not used by this function.
+ *
+ * Returns 0 on success, -EINVAL if the filesystem is not seeding, -ENOMEM
+ * if seed_devices cannot be allocated, or the error from
+ * clone_fs_devices().
+ */
+static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
+                                struct btrfs_root *root)
+{
+        struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+        struct btrfs_fs_devices *old_devices;
+        struct btrfs_fs_devices *seed_devices;
+        struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
+        struct btrfs_device *device;
+        u64 super_flags;
+        BUG_ON(!mutex_is_locked(&uuid_mutex));
+        if (!fs_devices->seeding)
+                return -EINVAL;
+        seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
+        if (!seed_devices)
+                return -ENOMEM;
+        old_devices = clone_fs_devices(fs_devices);
+        if (IS_ERR(old_devices)) {
+                kfree(seed_devices);
+                return PTR_ERR(old_devices);
+        }
+        list_add(&old_devices->list, &fs_uuids);
+        /*
+         * seed_devices starts as a byte copy of fs_devices; its list heads
+         * are re-initialised before the device lists are moved over
+         */
+        memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
+        seed_devices->opened = 1;
+        INIT_LIST_HEAD(&seed_devices->devices);
+        INIT_LIST_HEAD(&seed_devices->alloc_list);
+        list_splice_init(&fs_devices->devices, &seed_devices->devices);
+        list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
+        /* every moved device now belongs to the seed set */
+        list_for_each_entry(device, &seed_devices->devices, dev_list) {
+                device->fs_devices = seed_devices;
+        }
+        /* fs_devices becomes the empty, non-seeding set chained to the seed */
+        fs_devices->seeding = 0;
+        fs_devices->num_devices = 0;
+        fs_devices->open_devices = 0;
+        fs_devices->seed = seed_devices;
+        /* new fsid, mirrored into fs_info and the in-memory super block */
+        generate_random_uuid(fs_devices->fsid);
+        memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
+        memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
+        /* the sprouted filesystem is no longer flagged as a seed */
+        super_flags = btrfs_super_flags(disk_super) &
+                      ~BTRFS_SUPER_FLAG_SEEDING;
+        btrfs_set_super_flags(disk_super, super_flags);
+        return 0;
+}
+/*
+ * store the expected generation for seed devices in device items.
+ *
+ * Walks every BTRFS_DEV_ITEM_KEY item under BTRFS_DEV_ITEMS_OBJECTID in
+ * the chunk root, looks up the matching in-memory device and, when that
+ * device belongs to a seeding fs_devices, writes device->generation into
+ * the item.
+ *
+ * Returns 0 on success, -ENOMEM if no path can be allocated, or the
+ * negative error from the tree search / leaf walk.
+ */
+static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *root)
+{
+        struct btrfs_path *path;
+        struct extent_buffer *leaf;
+        struct btrfs_dev_item *dev_item;
+        struct btrfs_device *device;
+        struct btrfs_key key;
+        u8 fs_uuid[BTRFS_UUID_SIZE];
+        u8 dev_uuid[BTRFS_UUID_SIZE];
+        u64 devid;
+        int ret;
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+        root = root->fs_info->chunk_root;
+        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+        key.offset = 0;
+        key.type = BTRFS_DEV_ITEM_KEY;
+        while (1) {
+                ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+                if (ret < 0)
+                        goto error;
+                leaf = path->nodes[0];
+next_slot:
+                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+                        /*
+                         * ran off the end of this leaf: find the first key
+                         * of the next leaf, then drop the path and restart
+                         * the search from that key
+                         */
+                        ret = btrfs_next_leaf(root, path);
+                        if (ret > 0)
+                                break;
+                        if (ret < 0)
+                                goto error;
+                        leaf = path->nodes[0];
+                        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+                        btrfs_release_path(root, path);
+                        continue;
+                }
+                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+                /* stop at the first item that is not a device item */
+                if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
+                    key.type != BTRFS_DEV_ITEM_KEY)
+                        break;
+                dev_item = btrfs_item_ptr(leaf, path->slots[0],
+                                          struct btrfs_dev_item);
+                devid = btrfs_device_id(leaf, dev_item);
+                read_extent_buffer(leaf, dev_uuid,
+                                   (unsigned long)btrfs_device_uuid(dev_item),
+                                   BTRFS_UUID_SIZE);
+                read_extent_buffer(leaf, fs_uuid,
+                                   (unsigned long)btrfs_device_fsid(dev_item),
+                                   BTRFS_UUID_SIZE);
+                device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
+                BUG_ON(!device);
+                if (device->fs_devices->seeding) {
+                        btrfs_set_device_generation(leaf, dev_item,
+                                                    device->generation);
+                        btrfs_mark_buffer_dirty(leaf);
+                }
+                /* advance within the same leaf without a new search */
+                path->slots[0]++;
+                goto next_slot;
+        }
+        ret = 0;
+error:
+        btrfs_free_path(path);
+        return ret;
+}
+/*
+ * Add the block device at @device_path to the filesystem.
+ *
+ * The device is opened exclusively, checked against the devices already
+ * in fs_devices, given the next free devid and a random uuid, and linked
+ * into the device and allocation lists.  The super block copy's total
+ * bytes and device count are updated, and everything is committed in one
+ * transaction under the chunk lock.
+ *
+ * When the filesystem is a seeding one, it is sprouted first:
+ * s_umount and uuid_mutex are taken, MS_RDONLY is cleared,
+ * btrfs_prepare_sprout()/init_first_rw_device()/btrfs_finish_sprout() run
+ * inside the transaction and the system chunks are relocated afterwards.
+ *
+ * Returns 0 on success; -EINVAL for a read-only, non-seeding filesystem;
+ * the error from open_bdev_exclusive(); -EEXIST if the device is already
+ * part of the filesystem; -ENOMEM on allocation failure; or the error
+ * from find_next_devid() / btrfs_add_device().
+ */
+int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
+{
+        struct btrfs_trans_handle *trans;
+        struct btrfs_device *device;
+        struct block_device *bdev;
+        struct list_head *cur;
+        struct list_head *devices;
+        struct super_block *sb = root->fs_info->sb;
+        u64 total_bytes;
+        int seeding_dev = 0;
+        int ret = 0;
+        if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
+                return -EINVAL;
+        bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder);
+        /*
+         * failure is reported as an ERR_PTR value (see the IS_ERR() check
+         * in btrfs_rm_device), so a NULL test would never catch it
+         */
+        if (IS_ERR(bdev))
+                return PTR_ERR(bdev);
+        if (root->fs_info->fs_devices->seeding) {
+                seeding_dev = 1;
+                down_write(&sb->s_umount);
+                mutex_lock(&uuid_mutex);
+        }
+        filemap_write_and_wait(bdev->bd_inode->i_mapping);
+        mutex_lock(&root->fs_info->volume_mutex);
+        devices = &root->fs_info->fs_devices->devices;
+        list_for_each(cur, devices) {
+                device = list_entry(cur, struct btrfs_device, dev_list);
+                if (device->bdev == bdev) {
+                        ret = -EEXIST;
+                        goto error;
+                }
+        }
+        device = kzalloc(sizeof(*device), GFP_NOFS);
+        if (!device) {
+                /* we can safely leave the fs_devices entry around */
+                ret = -ENOMEM;
+                goto error;
+        }
+        device->name = kstrdup(device_path, GFP_NOFS);
+        if (!device->name) {
+                kfree(device);
+                ret = -ENOMEM;
+                goto error;
+        }
+        ret = find_next_devid(root, &device->devid);
+        if (ret) {
+                /* the name was allocated above and must not be leaked */
+                kfree(device->name);
+                kfree(device);
+                goto error;
+        }
+        trans = btrfs_start_transaction(root, 1);
+        lock_chunks(root);
+        device->barriers = 1;
+        device->writeable = 1;
+        device->work.func = pending_bios_fn;
+        generate_random_uuid(device->uuid);
+        spin_lock_init(&device->io_lock);
+        device->generation = trans->transid;
+        device->io_width = root->sectorsize;
+        device->io_align = root->sectorsize;
+        device->sector_size = root->sectorsize;
+        device->total_bytes = i_size_read(bdev->bd_inode);
+        device->dev_root = root->fs_info->dev_root;
+        device->bdev = bdev;
+        device->in_fs_metadata = 1;
+        device->mode = 0;
+        set_blocksize(device->bdev, 4096);
+        if (seeding_dev) {
+                sb->s_flags &= ~MS_RDONLY;
+                ret = btrfs_prepare_sprout(trans, root);
+                BUG_ON(ret);
+        }
+        /*
+         * fs_info->fs_devices is read only after the optional sprout
+         * above, which moves the old devices to the seed set
+         */
+        device->fs_devices = root->fs_info->fs_devices;
+        list_add(&device->dev_list, &root->fs_info->fs_devices->devices);
+        list_add(&device->dev_alloc_list,
+                 &root->fs_info->fs_devices->alloc_list);
+        root->fs_info->fs_devices->num_devices++;
+        root->fs_info->fs_devices->open_devices++;
+        root->fs_info->fs_devices->rw_devices++;
+        root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
+        total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
+        btrfs_set_super_total_bytes(&root->fs_info->super_copy,
+                                    total_bytes + device->total_bytes);
+        /* total_bytes is reused here to hold the device count */
+        total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
+        btrfs_set_super_num_devices(&root->fs_info->super_copy,
+                                    total_bytes + 1);
+        if (seeding_dev) {
+                ret = init_first_rw_device(trans, root, device);
+                BUG_ON(ret);
+                ret = btrfs_finish_sprout(trans, root);
+                BUG_ON(ret);
+        } else {
+                ret = btrfs_add_device(trans, root, device);
+        }
+        unlock_chunks(root);
+        btrfs_commit_transaction(trans, root);
+        if (seeding_dev) {
+                mutex_unlock(&uuid_mutex);
+                up_write(&sb->s_umount);
+                ret = btrfs_relocate_sys_chunks(root);
+                BUG_ON(ret);
+        }
+out:
+        mutex_unlock(&root->fs_info->volume_mutex);
+        return ret;
+error:
+        close_bdev_exclusive(bdev, 0);
+        if (seeding_dev) {
+                mutex_unlock(&uuid_mutex);
+                up_write(&sb->s_umount);
+        }
+        goto out;
+}
+/*
+ * Refresh the on-disk device item of @device in the chunk root from the
+ * in-memory struct: devid, type, io alignment/width, sector size, total
+ * bytes and bytes used.
+ *
+ * Returns 0 on success, -ENOMEM if no path can be allocated, -ENOENT if
+ * the device item does not exist, or the negative error from the search.
+ */
+static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
+                                        struct btrfs_device *device)
+{
+        struct btrfs_root *chunk_root = device->dev_root->fs_info->chunk_root;
+        struct btrfs_key key = {
+                .objectid = BTRFS_DEV_ITEMS_OBJECTID,
+                .type = BTRFS_DEV_ITEM_KEY,
+                .offset = device->devid,
+        };
+        struct btrfs_dev_item *item;
+        struct extent_buffer *eb;
+        struct btrfs_path *path;
+        int err;
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+        err = btrfs_search_slot(trans, chunk_root, &key, path, 0, 1);
+        /* a positive result means the exact key was not found */
+        if (err > 0)
+                err = -ENOENT;
+        if (err)
+                goto free_path;
+        eb = path->nodes[0];
+        item = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_item);
+        btrfs_set_device_id(eb, item, device->devid);
+        btrfs_set_device_type(eb, item, device->type);
+        btrfs_set_device_io_align(eb, item, device->io_align);
+        btrfs_set_device_io_width(eb, item, device->io_width);
+        btrfs_set_device_sector_size(eb, item, device->sector_size);
+        btrfs_set_device_total_bytes(eb, item, device->total_bytes);
+        btrfs_set_device_bytes_used(eb, item, device->bytes_used);
+        btrfs_mark_buffer_dirty(eb);
+free_path:
+        btrfs_free_path(path);
+        return err;
+}
+/*
+ * Grow @device to @new_size bytes: add the size difference to the super
+ * block copy's total bytes and to fs_devices->total_rw_bytes, then write
+ * the updated device item.  Callers in this file hold the chunk lock.
+ *
+ * Returns -EACCES for a non-writeable device, -EINVAL if @new_size is not
+ * larger than the current size, otherwise the result of
+ * btrfs_update_device().
+ */
+static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
+                      struct btrfs_device *device, u64 new_size)
+{
+        struct btrfs_super_block *sb_copy;
+        u64 delta;
+        if (!device->writeable)
+                return -EACCES;
+        if (new_size <= device->total_bytes)
+                return -EINVAL;
+        sb_copy = &device->dev_root->fs_info->super_copy;
+        delta = new_size - device->total_bytes;
+        btrfs_set_super_total_bytes(sb_copy,
+                                    btrfs_super_total_bytes(sb_copy) + delta);
+        device->fs_devices->total_rw_bytes += delta;
+        device->total_bytes = new_size;
+        return btrfs_update_device(trans, device);
+}
+/*
+ * Locked wrapper around __btrfs_grow_device(): takes the chunk lock of the
+ * device's dev_root for the duration of the resize and returns its result.
+ */
+int btrfs_grow_device(struct btrfs_trans_handle *trans,
+                      struct btrfs_device *device, u64 new_size)
+{
+        struct btrfs_root *dev_root = device->dev_root;
+        int err;
+        lock_chunks(dev_root);
+        err = __btrfs_grow_device(trans, device, new_size);
+        unlock_chunks(dev_root);
+        return err;
+}
+/*
+ * Delete the chunk item (chunk_objectid, BTRFS_CHUNK_ITEM_KEY,
+ * chunk_offset) from the chunk root.  @chunk_tree is not used here.
+ *
+ * Returns -ENOMEM if no path can be allocated and 0 otherwise; a failed or
+ * inexact search, or a failed delete, trips BUG_ON().
+ */
+static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
+                            struct btrfs_root *root,
+                            u64 chunk_tree, u64 chunk_objectid,
+                            u64 chunk_offset)
+{
+        struct btrfs_root *chunk_root = root->fs_info->chunk_root;
+        struct btrfs_key key = {
+                .objectid = chunk_objectid,
+                .type = BTRFS_CHUNK_ITEM_KEY,
+                .offset = chunk_offset,
+        };
+        struct btrfs_path *path;
+        int err;
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+        err = btrfs_search_slot(trans, chunk_root, &key, path, -1, 1);
+        BUG_ON(err);
+        err = btrfs_del_item(trans, chunk_root, path);
+        BUG_ON(err);
+        btrfs_free_path(path);
+        return 0;
+}
+/*
+ * Remove the entry for (chunk_objectid, chunk_offset) from the
+ * sys_chunk_array embedded in the in-memory super block copy.
+ *
+ * The array is a packed sequence of (disk key, chunk item) pairs.  A
+ * matching entry is squeezed out by moving the rest of the array down and
+ * shrinking the recorded array size.
+ *
+ * Returns 0 once the whole array has been scanned, or -EIO when an entry
+ * whose key type is not BTRFS_CHUNK_ITEM_KEY is encountered.
+ */
+static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
+                        chunk_offset)
+{
+        struct btrfs_super_block *sb_copy = &root->fs_info->super_copy;
+        u32 total = btrfs_super_sys_array_size(sb_copy);
+        u8 *pos = sb_copy->sys_chunk_array;
+        u32 done = 0;
+        while (done < total) {
+                struct btrfs_chunk *chunk;
+                struct btrfs_key key;
+                u32 nr_stripes;
+                u32 entry_len = sizeof(struct btrfs_disk_key);
+                btrfs_disk_key_to_cpu(&key, (struct btrfs_disk_key *)pos);
+                if (key.type != BTRFS_CHUNK_ITEM_KEY)
+                        return -EIO;
+                /* the chunk item follows its key directly */
+                chunk = (struct btrfs_chunk *)(pos + entry_len);
+                nr_stripes = btrfs_stack_chunk_num_stripes(chunk);
+                entry_len += btrfs_chunk_item_size(nr_stripes);
+                if (key.objectid != chunk_objectid ||
+                    key.offset != chunk_offset) {
+                        pos += entry_len;
+                        done += entry_len;
+                        continue;
+                }
+                /* match: close the gap and re-examine the same position */
+                memmove(pos, pos + entry_len, total - (done + entry_len));
+                total -= entry_len;
+                btrfs_set_super_sys_array_size(sb_copy, total);
+        }
+        return 0;
+}
+/*
+ * Relocate one chunk and remove every trace of it.
+ *
+ * The extents inside the chunk's block group are relocated first.  Then,
+ * in a transaction and under the chunk lock, the device extents of every
+ * stripe are freed, the chunk item is deleted (plus its sys_chunk_array
+ * entry for system chunks), the block group is removed and the extent
+ * mapping is dropped from the mapping tree.
+ *
+ * Always returns 0; every failing step trips BUG_ON().
+ */
+static int btrfs_relocate_chunk(struct btrfs_root *root,
+                         u64 chunk_tree, u64 chunk_objectid,
+                         u64 chunk_offset)
+{
+        struct extent_map_tree *em_tree;
+        struct btrfs_root *extent_root;
+        struct btrfs_trans_handle *trans;
+        struct extent_map *em;
+        struct map_lookup *map;
+        int ret;
+        int i;
+        printk(KERN_INFO "btrfs relocating chunk %llu\n",
+               (unsigned long long)chunk_offset);
+        /* should we defrag metadata too? */
+        root = root->fs_info->chunk_root;
+        extent_root = root->fs_info->extent_root;
+        em_tree = &root->fs_info->mapping_tree.map_tree;
+        /* step one, relocate all the extents inside this chunk */
+        ret = btrfs_relocate_block_group(extent_root, chunk_offset);
+        BUG_ON(ret);
+        trans = btrfs_start_transaction(root, 1);
+        BUG_ON(!trans);
+        lock_chunks(root);
+        /*
+         * step two, delete the device extents and the
+         * chunk tree entries
+         */
+        spin_lock(&em_tree->lock);
+        em = lookup_extent_mapping(em_tree, chunk_offset, 1);
+        spin_unlock(&em_tree->lock);
+        /*
+         * NOTE(review): em is dereferenced without a NULL check; this
+         * presumably relies on the mapping always existing -- confirm
+         */
+        BUG_ON(em->start > chunk_offset ||
+               em->start + em->len < chunk_offset);
+        /* the chunk's map_lookup is stored in the extent map's bdev field */
+        map = (struct map_lookup *)em->bdev;
+        for (i = 0; i < map->num_stripes; i++) {
+                ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
+                                            map->stripes[i].physical);
+                BUG_ON(ret);
+                /*
+                 * NOTE(review): the stripe's dev has already been handed
+                 * to btrfs_free_dev_extent() above, so this NULL check
+                 * looks too late to protect that call -- confirm
+                 */
+                if (map->stripes[i].dev) {
+                        ret = btrfs_update_device(trans, map->stripes[i].dev);
+                        BUG_ON(ret);
+                }
+        }
+        ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
+                               chunk_offset);
+        BUG_ON(ret);
+        /* system chunks are also listed in the super block's array */
+        if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
+                ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
+                BUG_ON(ret);
+        }
+        ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
+        BUG_ON(ret);
+        spin_lock(&em_tree->lock);
+        remove_extent_mapping(em_tree, em);
+        spin_unlock(&em_tree->lock);
+        kfree(map);
+        em->bdev = NULL;
+        /* once for the tree */
+        free_extent_map(em);
+        /* once for us */
+        free_extent_map(em);
+        unlock_chunks(root);
+        btrfs_end_transaction(trans, root);
+        return 0;
+}
+/*
+ * Relocate every chunk whose type has BTRFS_BLOCK_GROUP_SYSTEM set.
+ *
+ * Chunk items under BTRFS_FIRST_CHUNK_TREE_OBJECTID are visited from the
+ * highest offset downwards; the search key is lowered to just below the
+ * chunk last seen after each step.
+ *
+ * Returns 0 on success, -ENOMEM if no path can be allocated, or the
+ * negative error from the tree search.  A failed relocation trips BUG_ON().
+ */
+static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
+{
+        struct btrfs_root *chunk_root = root->fs_info->chunk_root;
+        u64 chunk_tree = chunk_root->root_key.objectid;
+        struct btrfs_key search = {
+                .objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID,
+                .type = BTRFS_CHUNK_ITEM_KEY,
+                .offset = (u64)-1,
+        };
+        struct btrfs_key found;
+        struct extent_buffer *eb;
+        struct btrfs_path *path;
+        u64 type;
+        int err;
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+        for (;;) {
+                err = btrfs_search_slot(NULL, chunk_root, &search, path, 0, 0);
+                if (err < 0)
+                        goto out;
+                BUG_ON(err == 0);
+                err = btrfs_previous_item(chunk_root, path, search.objectid,
+                                          search.type);
+                if (err < 0)
+                        goto out;
+                if (err > 0)
+                        break;
+                eb = path->nodes[0];
+                btrfs_item_key_to_cpu(eb, &found, path->slots[0]);
+                type = btrfs_chunk_type(eb,
+                                        btrfs_item_ptr(eb, path->slots[0],
+                                                       struct btrfs_chunk));
+                /* drop the path before relocating */
+                btrfs_release_path(chunk_root, path);
+                if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
+                        err = btrfs_relocate_chunk(chunk_root, chunk_tree,
+                                                   found.objectid,
+                                                   found.offset);
+                        BUG_ON(err);
+                }
+                if (found.offset == 0)
+                        break;
+                search.offset = found.offset - 1;
+        }
+        err = 0;
+out:
+        btrfs_free_path(path);
+        return err;
+}
+/*
+ * Scale @num by @factor tenths, i.e. return num * factor / 10.  A factor
+ * of 10 returns @num unchanged without doing the arithmetic.
+ */
+static u64 div_factor(u64 num, int factor)
+{
+        if (factor != 10) {
+                num *= factor;
+                do_div(num, 10);
+        }
+        return num;
+}
+/*
+ * relocate every chunk in the chunk tree except the one at offset zero.
+ *
+ * The whole operation runs with fs_info->volume_mutex held.  Step one
+ * makes sure each writeable device has a little free space by shrinking
+ * it and growing it straight back.  Step two walks the chunk items from
+ * the highest offset downwards and relocates them one at a time.
+ *
+ * returns -EROFS if the super block is flagged MS_RDONLY, a negative
+ * value from btrfs_search_slot if the chunk tree search fails, and 0
+ * otherwise.  Failures of the shrink, grow and relocate calls are not
+ * returned, they trip a BUG_ON.
+ */
+int btrfs_balance(struct btrfs_root *dev_root)
+{
+        int ret;
+        struct list_head *cur;
+        struct list_head *devices = &dev_root->fs_info->fs_devices->devices;
+        struct btrfs_device *device;
+        u64 old_size;
+        u64 size_to_free;
+        struct btrfs_path *path;
+        struct btrfs_key key;
+        struct btrfs_chunk *chunk;
+        struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
+        struct btrfs_trans_handle *trans;
+        struct btrfs_key found_key;
+        if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
+                return -EROFS;
+        mutex_lock(&dev_root->fs_info->volume_mutex);
+        /* from here on work on the fs wide device root, not the one passed in */
+        dev_root = dev_root->fs_info->dev_root;
+        /* step one make some room on all the devices */
+        list_for_each(cur, devices) {
+                device = list_entry(cur, struct btrfs_device, dev_list);
+                old_size = device->total_bytes;
+                /* aim for 10% of the device, but never more than 1MB */
+                size_to_free = div_factor(old_size, 1);
+                size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
+                /* skip read only devices and ones with enough room already */
+                if (!device->writeable ||
+                    device->total_bytes - device->bytes_used > size_to_free)
+                        continue;
+                ret = btrfs_shrink_device(device, old_size - size_to_free);
+                BUG_ON(ret);
+                trans = btrfs_start_transaction(dev_root, 1);
+                BUG_ON(!trans);
+                /* put the device back to the size it had before the shrink */
+                ret = btrfs_grow_device(trans, device, old_size);
+                BUG_ON(ret);
+                btrfs_end_transaction(trans, dev_root);
+        }
+        /* step two, relocate all the chunks */
+        path = btrfs_alloc_path();
+        BUG_ON(!path);
+        /* start the search past the last possible chunk item */
+        key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+        key.offset = (u64)-1;
+        key.type = BTRFS_CHUNK_ITEM_KEY;
+        while (1) {
+                ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
+                if (ret < 0)
+                        goto error;
+                /*
+                 * this shouldn't happen, it means the last relocate
+                 * failed
+                 */
+                if (ret == 0)
+                        break;
+                /* step back to the chunk item in front of the search key */
+                ret = btrfs_previous_item(chunk_root, path, 0,
+                                          BTRFS_CHUNK_ITEM_KEY);
+                if (ret)
+                        break;
+                btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+                                      path->slots[0]);
+                if (found_key.objectid != key.objectid)
+                        break;
+                /* NOTE(review): chunk is assigned here but never read */
+                chunk = btrfs_item_ptr(path->nodes[0],
+                                       path->slots[0],
+                                       struct btrfs_chunk);
+                /* the next search starts from the chunk we just found */
+                key.offset = found_key.offset;
+                /* chunk zero is special */
+                if (key.offset == 0)
+                        break;
+                /* drop the path before relocating the chunk */
+                btrfs_release_path(chunk_root, path);
+                ret = btrfs_relocate_chunk(chunk_root,
+                                           chunk_root->root_key.objectid,
+                                           found_key.objectid,
+                                           found_key.offset);
+                BUG_ON(ret);
+        }
+        /* every way out of the loop other than a failed search is success */
+        ret = 0;
+error:
+        btrfs_free_path(path);
+        mutex_unlock(&dev_root->fs_info->volume_mutex);
+        return ret;
+}
+/*
+ * shrinking a device means finding all of the device extents past
+ * the new size, and then following the back refs to the chunks.
+ * The chunk relocation code actually frees the device extent
+ *
+ * The smaller size is recorded first (device, fs_devices total_rw_bytes
+ * and the super block copy) inside one transaction, and only then are
+ * the device extents that end past new_size relocated.
+ *
+ * returns -EINVAL if new_size is not smaller than the current size,
+ * -ENOMEM if the path or the transaction can't be set up, the error
+ * from btrfs_update_device, btrfs_search_slot, btrfs_previous_item or
+ * btrfs_relocate_chunk if one of them fails, and 0 on success.
+ */
+int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
+{
+        struct btrfs_trans_handle *trans;
+        struct btrfs_root *root = device->dev_root;
+        struct btrfs_dev_extent *dev_extent = NULL;
+        struct btrfs_path *path;
+        u64 length;
+        u64 chunk_tree;
+        u64 chunk_objectid;
+        u64 chunk_offset;
+        int ret;
+        int slot;
+        struct extent_buffer *l;
+        struct btrfs_key key;
+        struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+        u64 old_total = btrfs_super_total_bytes(super_copy);
+        /* only meaningful once the size check below has passed */
+        u64 diff = device->total_bytes - new_size;
+        if (new_size >= device->total_bytes)
+                return -EINVAL;
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+        trans = btrfs_start_transaction(root, 1);
+        if (!trans) {
+                ret = -ENOMEM;
+                goto done;
+        }
+        path->reada = 2;
+        /* the size accounting is changed with the chunk lock held */
+        lock_chunks(root);
+        device->total_bytes = new_size;
+        if (device->writeable)
+                device->fs_devices->total_rw_bytes -= diff;
+        ret = btrfs_update_device(trans, device);
+        if (ret) {
+                /*
+                 * NOTE(review): the in-memory sizes changed above are not
+                 * put back on this path
+                 */
+                unlock_chunks(root);
+                btrfs_end_transaction(trans, root);
+                goto done;
+        }
+        WARN_ON(diff > old_total);
+        btrfs_set_super_total_bytes(super_copy, old_total - diff);
+        unlock_chunks(root);
+        btrfs_end_transaction(trans, root);
+        /* walk this device's extents starting from the highest offset */
+        key.objectid = device->devid;
+        key.offset = (u64)-1;
+        key.type = BTRFS_DEV_EXTENT_KEY;
+        while (1) {
+                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+                if (ret < 0)
+                        goto done;
+                ret = btrfs_previous_item(root, path, 0, key.type);
+                if (ret < 0)
+                        goto done;
+                /* no previous item at all, there is nothing left to move */
+                if (ret) {
+                        ret = 0;
+                        goto done;
+                }
+                l = path->nodes[0];
+                slot = path->slots[0];
+                btrfs_item_key_to_cpu(l, &key, path->slots[0]);
+                /* the item belongs to another device; ret is 0 here */
+                if (key.objectid != device->devid)
+                        goto done;
+                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
+                length = btrfs_dev_extent_length(l, dev_extent);
+                /* the highest extent ends inside the new size, we're done */
+                if (key.offset + length <= new_size)
+                        goto done;
+                /* read the back reference to the chunk before dropping the path */
+                chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
+                chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
+                chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
+                btrfs_release_path(root, path);
+                ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
+                                           chunk_offset);
+                if (ret)
+                        goto done;
+        }
+done:
+        btrfs_free_path(path);
+        return ret;
+}
+/*
+ * append a chunk item to the sys_chunk_array held in the in-memory
+ * super block copy.
+ *
+ * Each array entry is a btrfs_disk_key made from key, followed directly
+ * by item_size bytes copied from chunk.  The recorded array size grows
+ * by the size of both parts.
+ *
+ * returns 0 on success, or -EFBIG without touching the array when the
+ * key plus the chunk don't fit in the space that is left.
+ */
+static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
+                           struct btrfs_root *root,
+                           struct btrfs_key *key,
+                           struct btrfs_chunk *chunk, int item_size)
+{
+        struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+        struct btrfs_disk_key disk_key;
+        u32 array_size;
+        u8 *ptr;
+        array_size = btrfs_super_sys_array_size(super_copy);
+        /*
+         * the disk key is stored in front of the chunk, so it has to be
+         * part of the space check or the copies below can run past the
+         * end of the array
+         */
+        if (array_size + sizeof(disk_key) + item_size >
+            BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
+                return -EFBIG;
+        ptr = super_copy->sys_chunk_array + array_size;
+        btrfs_cpu_key_to_disk(&disk_key, key);
+        memcpy(ptr, &disk_key, sizeof(disk_key));
+        ptr += sizeof(disk_key);
+        memcpy(ptr, chunk, item_size);
+        item_size += sizeof(disk_key);
+        btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
+        return 0;
+}
+/*
+ * work out the chunk size from the size of one stripe (calc_size) and
+ * the profile bits in type.  RAID1 and DUP give calc_size, RAID10 gives
+ * calc_size times num_stripes / sub_stripes, and everything else gives
+ * calc_size times num_stripes.
+ */
+static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size,
+                                        int num_stripes, int sub_stripes)
+{
+        if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP))
+                return calc_size;
+        /* sub_stripes is only used as a divisor for raid10 */
+        if (type & BTRFS_BLOCK_GROUP_RAID10)
+                num_stripes /= sub_stripes;
+        return calc_size * num_stripes;
+}
+/*
+ * first half of chunk allocation: pick the devices and the stripe size
+ * for a new chunk of the given type at logical offset start, insert the
+ * mapping into the in-memory mapping tree, create the block group and
+ * allocate the device extents.  No chunk item is inserted here.
+ *
+ * map_ret:     set to the kmalloc'ed map_lookup describing the stripes
+ * num_bytes:   set to the size of the chunk
+ * stripe_size: set to the size of each stripe
+ * start:       logical offset of the new chunk
+ * type:        block group type and profile bits
+ *
+ * returns 0 on success, -ENOSPC when there aren't enough devices or
+ * free space for the profile, and -ENOMEM when the map or the extent
+ * map can't be allocated.
+ */
+static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *extent_root,
+                               struct map_lookup **map_ret,
+                               u64 *num_bytes, u64 *stripe_size,
+                               u64 start, u64 type)
+{
+        struct btrfs_fs_info *info = extent_root->fs_info;
+        struct btrfs_device *device = NULL;
+        struct btrfs_fs_devices *fs_devices = info->fs_devices;
+        struct list_head *cur;
+        struct map_lookup *map = NULL;
+        struct extent_map_tree *em_tree;
+        struct extent_map *em;
+        struct list_head private_devs;
+        int min_stripe_size = 1 * 1024 * 1024;
+        u64 calc_size = 1024 * 1024 * 1024;
+        u64 max_chunk_size = calc_size;
+        u64 min_free;
+        u64 avail;
+        u64 max_avail = 0;
+        u64 dev_offset;
+        int num_stripes = 1;
+        int min_stripes = 1;
+        int sub_stripes = 0;
+        int looped = 0;
+        int ret;
+        int index;
+        int stripe_len = 64 * 1024;
+        /* raid1 and dup together make no sense, raid1 wins */
+        if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
+            (type & BTRFS_BLOCK_GROUP_DUP)) {
+                WARN_ON(1);
+                type &= ~BTRFS_BLOCK_GROUP_DUP;
+        }
+        if (list_empty(&fs_devices->alloc_list))
+                return -ENOSPC;
+        /* raid0 starts with one stripe per writeable device */
+        if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
+                num_stripes = fs_devices->rw_devices;
+                min_stripes = 2;
+        }
+        /* dup is always exactly two stripes */
+        if (type & (BTRFS_BLOCK_GROUP_DUP)) {
+                num_stripes = 2;
+                min_stripes = 2;
+        }
+        /* raid1 needs two writeable devices */
+        if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
+                num_stripes = min_t(u64, 2, fs_devices->rw_devices);
+                if (num_stripes < 2)
+                        return -ENOSPC;
+                min_stripes = 2;
+        }
+        /* raid10 needs at least four devices and uses an even number */
+        if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
+                num_stripes = fs_devices->rw_devices;
+                if (num_stripes < 4)
+                        return -ENOSPC;
+                num_stripes &= ~(u32)1;
+                sub_stripes = 2;
+                min_stripes = 4;
+        }
+        /* size limits depend on what the chunk is going to hold */
+        if (type & BTRFS_BLOCK_GROUP_DATA) {
+                max_chunk_size = 10 * calc_size;
+                min_stripe_size = 64 * 1024 * 1024;
+        } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
+                max_chunk_size = 4 * calc_size;
+                min_stripe_size = 32 * 1024 * 1024;
+        } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
+                calc_size = 8 * 1024 * 1024;
+                max_chunk_size = calc_size * 2;
+                min_stripe_size = 1 * 1024 * 1024;
+        }
+        /* we don't want a chunk larger than 10% of writeable space */
+        max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
+                             max_chunk_size);
+again:
+        /* (re)allocate the map whenever the stripe count has changed */
+        if (!map || map->num_stripes != num_stripes) {
+                kfree(map);
+                map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
+                if (!map)
+                        return -ENOMEM;
+                map->num_stripes = num_stripes;
+        }
+        /* cap the stripe size and round it down to a stripe_len multiple */
+        if (calc_size * num_stripes > max_chunk_size) {
+                calc_size = max_chunk_size;
+                do_div(calc_size, num_stripes);
+                do_div(calc_size, stripe_len);
+                calc_size *= stripe_len;
+        }
+        /* we don't want tiny stripes */
+        calc_size = max_t(u64, min_stripe_size, calc_size);
+        do_div(calc_size, stripe_len);
+        calc_size *= stripe_len;
+        cur = fs_devices->alloc_list.next;
+        index = 0;
+        /* dup puts both stripes on one device, so it needs twice the room */
+        if (type & BTRFS_BLOCK_GROUP_DUP)
+                min_free = calc_size * 2;
+        else
+                min_free = calc_size;
+        /*
+         * we add 1MB because we never use the first 1MB of the device, unless
+         * we've looped, then we are likely allocating the maximum amount of
+         * space left already
+         */
+        if (!looped)
+                min_free += 1024 * 1024;
+        INIT_LIST_HEAD(&private_devs);
+        /* walk the alloc list once, taking devices with enough free space */
+        while (index < num_stripes) {
+                device = list_entry(cur, struct btrfs_device, dev_alloc_list);
+                BUG_ON(!device->writeable);
+                if (device->total_bytes > device->bytes_used)
+                        avail = device->total_bytes - device->bytes_used;
+                else
+                        avail = 0;
+                /* advance now, the device may be moved off the list below */
+                cur = cur->next;
+                if (device->in_fs_metadata && avail >= min_free) {
+                        ret = find_free_dev_extent(trans, device,
+                                                   min_free, &dev_offset);
+                        if (ret == 0) {
+                                /* park the device so it isn't picked twice */
+                                list_move_tail(&device->dev_alloc_list,
+                                               &private_devs);
+                                map->stripes[index].dev = device;
+                                map->stripes[index].physical = dev_offset;
+                                index++;
+                                /* the dup copy sits right behind the first */
+                                if (type & BTRFS_BLOCK_GROUP_DUP) {
+                                        map->stripes[index].dev = device;
+                                        map->stripes[index].physical =
+                                                dev_offset + calc_size;
+                                        index++;
+                                }
+                        }
+                } else if (device->in_fs_metadata && avail > max_avail)
+                        max_avail = avail;
+                if (cur == &fs_devices->alloc_list)
+                        break;
+        }
+        /* give the parked devices back to the alloc list */
+        list_splice(&private_devs, &fs_devices->alloc_list);
+        if (index < num_stripes) {
+                /* fewer stripes than wanted but enough for the profile */
+                if (index >= min_stripes) {
+                        num_stripes = index;
+                        if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
+                                num_stripes /= sub_stripes;
+                                num_stripes *= sub_stripes;
+                        }
+                        looped = 1;
+                        goto again;
+                }
+                /* retry once with the largest free space seen on a device */
+                if (!looped && max_avail > 0) {
+                        looped = 1;
+                        calc_size = max_avail;
+                        goto again;
+                }
+                kfree(map);
+                return -ENOSPC;
+        }
+        map->sector_size = extent_root->sectorsize;
+        map->stripe_len = stripe_len;
+        map->io_align = stripe_len;
+        map->io_width = stripe_len;
+        map->type = type;
+        map->num_stripes = num_stripes;
+        map->sub_stripes = sub_stripes;
+        *map_ret = map;
+        *stripe_size = calc_size;
+        *num_bytes = chunk_bytes_by_type(type, calc_size,
+                                         num_stripes, sub_stripes);
+        em = alloc_extent_map(GFP_NOFS);
+        if (!em) {
+                /*
+                 * NOTE(review): *map_ret was already set above and still
+                 * points at the map freed here
+                 */
+                kfree(map);
+                return -ENOMEM;
+        }
+        /* the bdev field of a chunk mapping carries the map_lookup */
+        em->bdev = (struct block_device *)map;
+        em->start = start;
+        em->len = *num_bytes;
+        em->block_start = 0;
+        em->block_len = em->len;
+        em_tree = &extent_root->fs_info->mapping_tree.map_tree;
+        spin_lock(&em_tree->lock);
+        ret = add_extent_mapping(em_tree, em);
+        spin_unlock(&em_tree->lock);
+        BUG_ON(ret);
+        /* drop our reference now that the insert has been checked */
+        free_extent_map(em);
+        ret = btrfs_make_block_group(trans, extent_root, 0, type,
+                                     BTRFS_FIRST_CHUNK_TREE_OBJECTID,
+                                     start, *num_bytes);
+        BUG_ON(ret);
+        /* allocate one device extent of calc_size bytes per stripe */
+        index = 0;
+        while (index < map->num_stripes) {
+                device = map->stripes[index].dev;
+                dev_offset = map->stripes[index].physical;
+                ret = btrfs_alloc_dev_extent(trans, device,
+                                info->chunk_root->root_key.objectid,
+                                BTRFS_FIRST_CHUNK_TREE_OBJECTID,
+                                start, dev_offset, calc_size);
+                BUG_ON(ret);
+                index++;
+        }
+        return 0;
+}
+/*
+ * second half of chunk allocation: charge stripe_size to every device
+ * in the map, build the chunk item from the map and insert it into the
+ * chunk tree at chunk_offset.  System chunks are also added to the
+ * super block's sys_chunk_array.
+ *
+ * returns 0 on success or -ENOMEM if the temporary chunk item can't be
+ * allocated.  Any other failure trips a BUG_ON.
+ */
+static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
+                                struct btrfs_root *extent_root,
+                                struct map_lookup *map, u64 chunk_offset,
+                                u64 chunk_size, u64 stripe_size)
+{
+        struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
+        size_t item_size = btrfs_chunk_item_size(map->num_stripes);
+        struct btrfs_chunk *chunk;
+        struct btrfs_stripe *stripe;
+        struct btrfs_device *dev;
+        struct btrfs_key key;
+        u64 physical;
+        int ret;
+        int i;
+        chunk = kzalloc(item_size, GFP_NOFS);
+        if (!chunk)
+                return -ENOMEM;
+        /* account one stripe worth of space on each device */
+        for (i = 0; i < map->num_stripes; i++) {
+                dev = map->stripes[i].dev;
+                dev->bytes_used += stripe_size;
+                ret = btrfs_update_device(trans, dev);
+                BUG_ON(ret);
+        }
+        /* fill in the stripe records of the chunk item */
+        stripe = &chunk->stripe;
+        for (i = 0; i < map->num_stripes; i++, stripe++) {
+                dev = map->stripes[i].dev;
+                physical = map->stripes[i].physical;
+                btrfs_set_stack_stripe_devid(stripe, dev->devid);
+                btrfs_set_stack_stripe_offset(stripe, physical);
+                memcpy(stripe->dev_uuid, dev->uuid, BTRFS_UUID_SIZE);
+        }
+        /* and then the chunk header itself */
+        btrfs_set_stack_chunk_length(chunk, chunk_size);
+        btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
+        btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
+        btrfs_set_stack_chunk_type(chunk, map->type);
+        btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
+        btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
+        btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
+        btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
+        btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
+        key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+        key.type = BTRFS_CHUNK_ITEM_KEY;
+        key.offset = chunk_offset;
+        ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
+        BUG_ON(ret);
+        /* system chunks are recorded in the super block as well */
+        if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
+                ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk,
+                                             item_size);
+                BUG_ON(ret);
+        }
+        kfree(chunk);
+        return 0;
+}
+/*
+ * Chunk allocation is split into two parts.  The first part does the
+ * work that makes the newly allocated chunk usable, without doing
+ * anything that modifies the chunk tree.  The second part does the work
+ * that requires modifying the chunk tree.  This division is important
+ * for the bootstrap process of adding storage to a seed btrfs.
+ *
+ * returns 0 on success, or the error from find_next_chunk or
+ * __btrfs_alloc_chunk.  A failure in the second part trips a BUG_ON.
+ */
+int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+                      struct btrfs_root *extent_root, u64 type)
+{
+        struct map_lookup *map;
+        u64 start;
+        u64 num_bytes;
+        u64 stripe_bytes;
+        int err;
+        /* find the logical offset the new chunk will start at */
+        err = find_next_chunk(extent_root->fs_info->chunk_root,
+                              BTRFS_FIRST_CHUNK_TREE_OBJECTID, &start);
+        if (!err)
+                err = __btrfs_alloc_chunk(trans, extent_root, &map,
+                                          &num_bytes, &stripe_bytes,
+                                          start, type);
+        if (err)
+                return err;
+        err = __finish_chunk_alloc(trans, extent_root, map, start,
+                                   num_bytes, stripe_bytes);
+        BUG_ON(err);
+        return 0;
+}
+/*
+ * create the first metadata chunk and the first system chunk, then add
+ * device to the chunk tree.
+ *
+ * Both chunks go through the first half of allocation before anything
+ * touches the chunk tree; the system chunk is placed directly behind
+ * the metadata chunk.  Only after both exist are the device item and
+ * the two chunk items inserted.
+ *
+ * always returns 0, every failure trips a BUG_ON.
+ */
+static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
+                                         struct btrfs_root *root,
+                                         struct btrfs_device *device)
+{
+        u64 chunk_offset;
+        u64 sys_chunk_offset;
+        u64 chunk_size;
+        u64 sys_chunk_size;
+        u64 stripe_size;
+        u64 sys_stripe_size;
+        u64 alloc_profile;
+        struct map_lookup *map;
+        struct map_lookup *sys_map;
+        struct btrfs_fs_info *fs_info = root->fs_info;
+        struct btrfs_root *extent_root = fs_info->extent_root;
+        int ret;
+        ret = find_next_chunk(fs_info->chunk_root,
+                              BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
+        BUG_ON(ret);
+        /* metadata chunk: the configured profile masked by what is available */
+        alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
+                        (fs_info->metadata_alloc_profile &
+                         fs_info->avail_metadata_alloc_bits);
+        alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
+        ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
+                                  &stripe_size, chunk_offset, alloc_profile);
+        BUG_ON(ret);
+        /* the system chunk starts where the metadata chunk ends */
+        sys_chunk_offset = chunk_offset + chunk_size;
+        alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
+                        (fs_info->system_alloc_profile &
+                         fs_info->avail_system_alloc_bits);
+        alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
+        ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
+                                  &sys_chunk_size, &sys_stripe_size,
+                                  sys_chunk_offset, alloc_profile);
+        BUG_ON(ret);
+        ret = btrfs_add_device(trans, fs_info->chunk_root, device);
+        BUG_ON(ret);
+        /*
+         * Modifying chunk tree needs allocating new blocks from both
+         * system block group and metadata block group. So we only can
+         * do operations require modifying the chunk tree after both
+         * block groups were created.
+         */
+        ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
+                                   chunk_size, stripe_size);
+        BUG_ON(ret);
+        ret = __finish_chunk_alloc(trans, extent_root, sys_map,
+                                   sys_chunk_offset, sys_chunk_size,
+                                   sys_stripe_size);
+        BUG_ON(ret);
+        return 0;
+}
+/*
+ * check whether the chunk at chunk_offset has to be treated as read
+ * only.
+ *
+ * returns 1 if no mapping is found for chunk_offset or if any of the
+ * chunk's stripes sits on a device that is not writeable, 0 otherwise.
+ */
+int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
+{
+        struct extent_map_tree *em_tree =
+                &root->fs_info->mapping_tree.map_tree;
+        struct extent_map *em;
+        struct map_lookup *map;
+        int readonly;
+        int idx = 0;
+        spin_lock(&em_tree->lock);
+        em = lookup_extent_mapping(em_tree, chunk_offset, 1);
+        spin_unlock(&em_tree->lock);
+        if (!em)
+                return 1;
+        map = (struct map_lookup *)em->bdev;
+        /* stop at the first stripe whose device can't be written */
+        while (idx < map->num_stripes && map->stripes[idx].dev->writeable)
+                idx++;
+        readonly = idx < map->num_stripes;
+        free_extent_map(em);
+        return readonly;
+}
+/* set up the extent map tree embedded in a chunk mapping tree */
+void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
+{
+        struct extent_map_tree *em_tree = &tree->map_tree;
+        extent_map_tree_init(em_tree, GFP_NOFS);
+}
+/*
+ * empty a chunk mapping tree.  Mappings are looked up and removed one
+ * at a time under the tree lock, and freed once the lock is dropped.
+ */
+void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
+{
+        struct extent_map_tree *em_tree = &tree->map_tree;
+        struct extent_map *em;
+        for (;;) {
+                spin_lock(&em_tree->lock);
+                em = lookup_extent_mapping(em_tree, 0, (u64)-1);
+                if (em)
+                        remove_extent_mapping(em_tree, em);
+                spin_unlock(&em_tree->lock);
+                /* nothing left in the tree */
+                if (!em)
+                        return;
+                /* free whatever the mapping's bdev field points at */
+                kfree(em->bdev);
+                /* drop the reference the lookup gave us */
+                free_extent_map(em);
+                /* and the one the tree was holding */
+                free_extent_map(em);
+        }
+}
+/*
+ * report how many copies exist of the data at logical.
+ *
+ * DUP and RAID1 chunks have one copy per stripe, RAID10 chunks have
+ * sub_stripes copies and everything else has a single copy.  A logical
+ * address with no mapping trips a BUG_ON.
+ */
+int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
+{
+        struct extent_map_tree *em_tree = &map_tree->map_tree;
+        struct map_lookup *map;
+        struct extent_map *em;
+        int copies = 1;
+        spin_lock(&em_tree->lock);
+        em = lookup_extent_mapping(em_tree, logical, len);
+        spin_unlock(&em_tree->lock);
+        BUG_ON(!em);
+        /* the mapping we got back has to cover logical */
+        BUG_ON(!(em->start <= logical && em->start + em->len >= logical));
+        map = (struct map_lookup *)em->bdev;
+        if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
+                copies = map->num_stripes;
+        else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
+                copies = map->sub_stripes;
+        free_extent_map(em);
+        return copies;
+}
+/*
+ * pick a stripe to read from.  The optimal stripe is used if its device
+ * has a bdev, otherwise the first stripe in [first, first + num) whose
+ * device has one.
+ */
+static int find_live_mirror(struct map_lookup *map, int first, int num,
+                            int optimal)
+{
+        int idx = first;
+        int end = first + num;
+        if (map->stripes[optimal].dev->bdev)
+                return optimal;
+        while (idx < end) {
+                if (map->stripes[idx].dev->bdev)
+                        return idx;
+                idx++;
+        }
+        /*
+         * none of the candidates has a bdev.  Hand back the optimal one
+         * anyway and let the io error handling code clean up eventually
+         */
+        return optimal;
+}
+/*
+ * translate a logical address into the device stripe(s) that back it.
+ *
+ * map_tree:    chunk mapping tree to search
+ * rw:          io direction flags, the BIO_RW bit marks a write
+ * logical:     logical byte address to map
+ * length:      in: length of the request.  out: how much of it can be
+ *              done in one go (at most up to the end of the stripe for
+ *              the striped and mirrored profiles, otherwise up to the
+ *              end of the chunk)
+ * multi_ret:   if not NULL, set to a kzalloc'ed btrfs_multi_bio holding
+ *              the physical location of every stripe involved
+ * mirror_num:  1 based mirror to read from, 0 to let this code choose
+ * unplug_page: if not NULL, nothing is mapped and unplug_io_fn is called
+ *              for the devices instead
+ *
+ * With neither multi_ret nor unplug_page only *length is updated.
+ *
+ * returns 0 on success and -ENOMEM if the multi bio can't be allocated.
+ * A logical address with no mapping is a BUG(), unless unplug_page is
+ * set, in which case 0 is returned.
+ */
+static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
+                             u64 logical, u64 *length,
+                             struct btrfs_multi_bio **multi_ret,
+                             int mirror_num, struct page *unplug_page)
+{
+        struct extent_map *em;
+        struct map_lookup *map;
+        struct extent_map_tree *em_tree = &map_tree->map_tree;
+        u64 offset;
+        u64 stripe_offset;
+        u64 stripe_nr;
+        int stripes_allocated = 8;
+        int stripes_required = 1;
+        int stripe_index;
+        int i;
+        int num_stripes;
+        int max_errors = 0;
+        struct btrfs_multi_bio *multi = NULL;
+        /* reads only ever use a single stripe */
+        if (multi_ret && !(rw & (1 << BIO_RW)))
+                stripes_allocated = 1;
+again:
+        if (multi_ret) {
+                multi = kzalloc(btrfs_multi_bio_size(stripes_allocated),
+                                GFP_NOFS);
+                if (!multi)
+                        return -ENOMEM;
+                atomic_set(&multi->error, 0);
+        }
+        spin_lock(&em_tree->lock);
+        em = lookup_extent_mapping(em_tree, logical, *length);
+        spin_unlock(&em_tree->lock);
+        if (!em && unplug_page) {
+                /* don't leak the multi bio allocated above */
+                kfree(multi);
+                return 0;
+        }
+        if (!em) {
+                printk(KERN_CRIT "unable to find logical %llu len %llu\n",
+                       (unsigned long long)logical,
+                       (unsigned long long)*length);
+                BUG();
+        }
+        BUG_ON(em->start > logical || em->start + em->len < logical);
+        map = (struct map_lookup *)em->bdev;
+        offset = logical - em->start;
+        /* a mirror number this chunk doesn't have means "pick one" */
+        if (mirror_num > map->num_stripes)
+                mirror_num = 0;
+        /* if our multi bio struct is too small, back off and try again */
+        if (rw & (1 << BIO_RW)) {
+                if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
+                                 BTRFS_BLOCK_GROUP_DUP)) {
+                        stripes_required = map->num_stripes;
+                        max_errors = 1;
+                } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
+                        stripes_required = map->sub_stripes;
+                        max_errors = 1;
+                }
+        }
+        /*
+         * test the write bit the same way as everywhere else in this
+         * function.  Comparing rw against WRITE misses writes that carry
+         * extra flag bits, and those would skip the resize.
+         */
+        if (multi_ret && (rw & (1 << BIO_RW)) &&
+            stripes_allocated < stripes_required) {
+                stripes_allocated = map->num_stripes;
+                free_extent_map(em);
+                kfree(multi);
+                goto again;
+        }
+        stripe_nr = offset;
+        /*
+         * stripe_nr counts the total number of stripes we have to stride
+         * to get to this block
+         */
+        do_div(stripe_nr, map->stripe_len);
+        stripe_offset = stripe_nr * map->stripe_len;
+        BUG_ON(offset < stripe_offset);
+        /* stripe_offset is the offset of this block in its stripe*/
+        stripe_offset = offset - stripe_offset;
+        if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
+                         BTRFS_BLOCK_GROUP_RAID10 |
+                         BTRFS_BLOCK_GROUP_DUP)) {
+                /* we limit the length of each bio to what fits in a stripe */
+                *length = min_t(u64, em->len - offset,
+                              map->stripe_len - stripe_offset);
+        } else {
+                *length = em->len - offset;
+        }
+        /* the caller only wanted the length */
+        if (!multi_ret && !unplug_page)
+                goto out;
+        num_stripes = 1;
+        stripe_index = 0;
+        if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
+                /* writes and unplugs go to every mirror */
+                if (unplug_page || (rw & (1 << BIO_RW)))
+                        num_stripes = map->num_stripes;
+                else if (mirror_num)
+                        stripe_index = mirror_num - 1;
+                else {
+                        /* spread reads over the mirrors by pid */
+                        stripe_index = find_live_mirror(map, 0,
+                                            map->num_stripes,
+                                            current->pid % map->num_stripes);
+                }
+        } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
+                if (rw & (1 << BIO_RW))
+                        num_stripes = map->num_stripes;
+                else if (mirror_num)
+                        stripe_index = mirror_num - 1;
+        } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
+                int factor = map->num_stripes / map->sub_stripes;
+                /* find the first stripe of the mirror group for this block */
+                stripe_index = do_div(stripe_nr, factor);
+                stripe_index *= map->sub_stripes;
+                if (unplug_page || (rw & (1 << BIO_RW)))
+                        num_stripes = map->sub_stripes;
+                else if (mirror_num)
+                        stripe_index += mirror_num - 1;
+                else {
+                        stripe_index = find_live_mirror(map, stripe_index,
+                                              map->sub_stripes, stripe_index +
+                                              current->pid % map->sub_stripes);
+                }
+        } else {
+                /*
+                 * after this do_div call, stripe_nr is the number of stripes
+                 * on this device we have to walk to find the data, and
+                 * stripe_index is the number of our device in the stripe array
+                 */
+                stripe_index = do_div(stripe_nr, map->num_stripes);
+        }
+        BUG_ON(stripe_index >= map->num_stripes);
+        for (i = 0; i < num_stripes; i++) {
+                if (unplug_page) {
+                        struct btrfs_device *device;
+                        struct backing_dev_info *bdi;
+                        device = map->stripes[stripe_index].dev;
+                        if (device->bdev) {
+                                bdi = blk_get_backing_dev_info(device->bdev);
+                                if (bdi->unplug_io_fn)
+                                        bdi->unplug_io_fn(bdi, unplug_page);
+                        }
+                } else {
+                        /* start of the stripe on disk plus where we are in it */
+                        multi->stripes[i].physical =
+                                map->stripes[stripe_index].physical +
+                                stripe_offset + stripe_nr * map->stripe_len;
+                        multi->stripes[i].dev = map->stripes[stripe_index].dev;
+                }
+                stripe_index++;
+        }
+        if (multi_ret) {
+                *multi_ret = multi;
+                multi->num_stripes = num_stripes;
+                multi->max_errors = max_errors;
+        }
+out:
+        free_extent_map(em);
+        return 0;
+}
+/*
+ * map a logical range to its device stripes.  This is __btrfs_map_block
+ * with no page to unplug.
+ */
+int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
+                      u64 logical, u64 *length,
+                      struct btrfs_multi_bio **multi_ret, int mirror_num)
+{
+        struct page *no_unplug_page = NULL;
+        return __btrfs_map_block(map_tree, rw, logical, length, multi_ret,
+                                 mirror_num, no_unplug_page);
+}
+/*
+ * Reverse mapping: translate a physical byte offset that lies inside the
+ * chunk starting at @chunk_start back into the logical address(es) of the
+ * stripe(s) that cover it.
+ *
+ * @map_tree:    mapping tree that holds the chunk
+ * @chunk_start: logical start of the chunk; a mapping starting exactly
+ *               here must exist
+ * @physical:    physical byte offset to translate
+ * @devid:       when non-zero, only stripes on the device with this id
+ *               are considered
+ * @logical:     out, array of the distinct logical addresses found.  It
+ *               is allocated here with kzalloc (presumably the caller is
+ *               expected to kfree it -- confirm at the call sites)
+ * @naddrs:      out, number of valid entries in *@logical
+ * @stripe_len:  out, stripe length of the chunk
+ *
+ * Always returns 0; a missing chunk mapping or a failed allocation hits
+ * BUG_ON.
+ */
+int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
+                     u64 chunk_start, u64 physical, u64 devid,
+                     u64 **logical, int *naddrs, int *stripe_len)
+{
+        struct extent_map_tree *em_tree = &map_tree->map_tree;
+        struct extent_map *em;
+        struct map_lookup *map;
+        u64 *buf;
+        u64 bytenr;
+        u64 length;
+        u64 stripe_nr;
+        int i, j, nr = 0;
+        spin_lock(&em_tree->lock);
+        em = lookup_extent_mapping(em_tree, chunk_start, 1);
+        spin_unlock(&em_tree->lock);
+        BUG_ON(!em || em->start != chunk_start);
+        /* the chunk's map_lookup is stashed in the extent map's bdev field */
+        map = (struct map_lookup *)em->bdev;
+        /*
+         * length is used below as the number of bytes each stripe covers
+         * on its device: the chunk length split across the stripes for
+         * RAID0/RAID10, the full chunk length otherwise.
+         */
+        length = em->len;
+        if (map->type & BTRFS_BLOCK_GROUP_RAID10)
+                do_div(length, map->num_stripes / map->sub_stripes);
+        else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
+                do_div(length, map->num_stripes);
+        /* at most one logical address per stripe can be produced */
+        buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
+        BUG_ON(!buf);
+        for (i = 0; i < map->num_stripes; i++) {
+                if (devid && map->stripes[i].dev->devid != devid)
+                        continue;
+                /* skip stripes whose device range does not hold @physical */
+                if (map->stripes[i].physical > physical ||
+                    map->stripes[i].physical + length <= physical)
+                        continue;
+                /* stripe index within this device's part of the chunk */
+                stripe_nr = physical - map->stripes[i].physical;
+                do_div(stripe_nr, map->stripe_len);
+                /* convert it to a stripe index within the whole chunk */
+                if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
+                        stripe_nr = stripe_nr * map->num_stripes + i;
+                        do_div(stripe_nr, map->sub_stripes);
+                } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
+                        stripe_nr = stripe_nr * map->num_stripes + i;
+                }
+                bytenr = chunk_start + stripe_nr * map->stripe_len;
+                WARN_ON(nr >= map->num_stripes);
+                /* record each logical address only once */
+                for (j = 0; j < nr; j++) {
+                        if (buf[j] == bytenr)
+                                break;
+                }
+                if (j == nr) {
+                        WARN_ON(nr >= map->num_stripes);
+                        buf[nr++] = bytenr;
+                }
+        }
+        /*
+         * A loop written as "for (i = 0; i > nr; i++)" used to follow
+         * here.  nr is never negative, so its body could not execute; it
+         * has been dropped as dead code rather than enabled.
+         */
+        *logical = buf;
+        *naddrs = nr;
+        *stripe_len = map->stripe_len;
+        free_extent_map(em);
+        return 0;
+}
+/*
+ * Call __btrfs_map_block() in READ mode for PAGE_CACHE_SIZE bytes starting
+ * at @logical, passing @page as the unplug page.  No multi-bio is
+ * requested and mirror 0 is used; the helper's return value is passed on.
+ */
+int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
+                      u64 logical, struct page *page)
+{
+        u64 page_len = PAGE_CACHE_SIZE;
+        int ret;
+        ret = __btrfs_map_block(map_tree, READ, logical, &page_len,
+                                NULL, 0, page);
+        return ret;
+}
+/*
+ * Completion handler installed by btrfs_map_bio() on every bio of a
+ * multi-stripe submission.  bio->bi_private points at the shared
+ * btrfs_multi_bio.
+ *
+ * Each completion counts its error (if any) and drops stripes_pending.
+ * The completion that brings the count to zero restores the original
+ * bio's end_io/private, frees the multi-bio and completes the original
+ * bio; every other completion only drops its reference on a cloned bio.
+ */
+static void end_bio_multi_stripe(struct bio *bio, int err)
+{
+        struct btrfs_multi_bio *multi = bio->bi_private;
+        int is_orig_bio = 0;
+        if (err)
+                atomic_inc(&multi->error);
+        /* must be sampled before multi can be freed by another completion */
+        if (bio == multi->orig_bio)
+                is_orig_bio = 1;
+        if (atomic_dec_and_test(&multi->stripes_pending)) {
+                /* last stripe: finish the original bio, not a clone */
+                if (!is_orig_bio) {
+                        bio_put(bio);
+                        bio = multi->orig_bio;
+                }
+                bio->bi_private = multi->private;
+                bio->bi_end_io = multi->end_io;
+                /* only send an error to the higher layers if it is
+                 * beyond the tolerance of the multi-bio
+                 */
+                if (atomic_read(&multi->error) > multi->max_errors) {
+                        err = -EIO;
+                } else if (err) {
+                        /*
+                         * this bio is actually up to date, we didn't
+                         * go over the max number of errors
+                         */
+                        set_bit(BIO_UPTODATE, &bio->bi_flags);
+                        err = 0;
+                }
+                kfree(multi);
+                bio_endio(bio, err);
+        } else if (!is_orig_bio) {
+                /* not the last stripe: just release this clone */
+                bio_put(bio);
+        }
+}
+/*
+ * Bundles a bio with its rw flags, the owning fs_info and a btrfs_work
+ * item.  NOTE(review): no user of this struct is visible in this part of
+ * the file; presumably it carries a bio to a worker thread -- confirm.
+ */
+struct async_sched {
+        struct bio *bio;
+        int rw;
+        struct btrfs_fs_info *info;
+        struct btrfs_work work;
+};
+/*
+ * Put one bio on a device's pending list and make sure the device's work
+ * item is queued so the list gets processed.  run_scheduled_bios
+ * describes why bios are collected for async submit.
+ *
+ * Bios without the BIO_RW bit set in @rw are submitted directly instead.
+ * Always returns 0.
+ */
+static noinline int schedule_bio(struct btrfs_root *root,
+                                 struct btrfs_device *device,
+                                 int rw, struct bio *bio)
+{
+        int queue_work;
+        /* reads do not go through the additional async steps, right now */
+        if (!(rw & (1 << BIO_RW))) {
+                bio_get(bio);
+                submit_bio(rw, bio);
+                bio_put(bio);
+                return 0;
+        }
+        /*
+         * nr_async_bios lets congestion be reported reliably to the
+         * higher layers.  Without it the async bio would look like
+         * progress against dirty pages when it has only been parked on a
+         * queue for later.
+         */
+        atomic_inc(&root->fs_info->nr_async_bios);
+        WARN_ON(bio->bi_next);
+        bio->bi_next = NULL;
+        bio->bi_rw |= rw;
+        spin_lock(&device->io_lock);
+        /* append to the tail of the singly linked pending list */
+        if (device->pending_bio_tail)
+                device->pending_bio_tail->bi_next = bio;
+        device->pending_bio_tail = bio;
+        if (!device->pending_bios)
+                device->pending_bios = bio;
+        /* nothing to queue while running_pending is set on the device */
+        queue_work = !device->running_pending;
+        spin_unlock(&device->io_lock);
+        if (!queue_work)
+                return 0;
+        btrfs_queue_worker(&root->fs_info->submit_workers,
+                           &device->work);
+        return 0;
+}
+/*
+ * Map a bio from its logical address onto the device stripe(s) backing it
+ * and submit it.
+ *
+ * @root:         root whose fs_info supplies the mapping tree
+ * @rw:           rw flags handed to btrfs_map_block() and submit_bio()
+ * @bio:          bio to submit; bi_sector holds the logical address in
+ *                512-byte sectors on entry
+ * @mirror_num:   passed through to btrfs_map_block()
+ * @async_submit: non-zero to hand the bios to schedule_bio() instead of
+ *                calling submit_bio() directly
+ *
+ * With more than one stripe the bio is cloned for every stripe but the
+ * last, which reuses the original bio; all of them complete through
+ * end_bio_multi_stripe(), which also frees the multi-bio.  With a single
+ * stripe the bio keeps its own end_io and the multi-bio is freed here.
+ *
+ * Always returns 0; mapping failures and a mapping shorter than the bio
+ * are fatal (BUG).
+ */
+int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
+                  int mirror_num, int async_submit)
+{
+        struct btrfs_mapping_tree *map_tree;
+        struct btrfs_device *dev;
+        struct bio *first_bio = bio;
+        u64 logical = (u64)bio->bi_sector << 9;
+        u64 length = 0;
+        u64 map_length;
+        struct btrfs_multi_bio *multi = NULL;
+        int ret;
+        int dev_nr = 0;
+        int total_devs = 1;
+        length = bio->bi_size;
+        map_tree = &root->fs_info->mapping_tree;
+        map_length = length;
+        ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi,
+                              mirror_num);
+        BUG_ON(ret);
+        total_devs = multi->num_stripes;
+        /* the whole bio has to fit inside the returned mapping */
+        if (map_length < length) {
+                printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
+                       "len %llu\n", (unsigned long long)logical,
+                       (unsigned long long)length,
+                       (unsigned long long)map_length);
+                BUG();
+        }
+        /* remember the caller's completion so it can be restored later */
+        multi->end_io = first_bio->bi_end_io;
+        multi->private = first_bio->bi_private;
+        multi->orig_bio = first_bio;
+        atomic_set(&multi->stripes_pending, multi->num_stripes);
+        while (dev_nr < total_devs) {
+                if (total_devs > 1) {
+                        /* clone for all but the last stripe */
+                        if (dev_nr < total_devs - 1) {
+                                bio = bio_clone(first_bio, GFP_NOFS);
+                                BUG_ON(!bio);
+                        } else {
+                                bio = first_bio;
+                        }
+                        bio->bi_private = multi;
+                        bio->bi_end_io = end_bio_multi_stripe;
+                }
+                bio->bi_sector = multi->stripes[dev_nr].physical >> 9;
+                dev = multi->stripes[dev_nr].dev;
+                /*
+                 * NOTE(review): dev is dereferenced here before the
+                 * "dev &&" NULL check just below.
+                 */
+                BUG_ON(rw == WRITE && !dev->writeable);
+                if (dev && dev->bdev) {
+                        bio->bi_bdev = dev->bdev;
+                        if (async_submit)
+                                schedule_bio(root, dev, rw, bio);
+                        else
+                                submit_bio(rw, bio);
+                } else {
+                        /* no block device behind this stripe: fail the bio */
+                        bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
+                        bio->bi_sector = logical >> 9;
+                        bio_endio(bio, -EIO);
+                }
+                dev_nr++;
+        }
+        /* single stripe: end_bio_multi_stripe never runs, so free it here */
+        if (total_devs == 1)
+                kfree(multi);
+        return 0;
+}
+/*
+ * Look up a device by @devid/@uuid, starting with the filesystem's own
+ * fs_devices and then following the ->seed chain.  When @fsid is non-NULL
+ * only device sets whose fsid matches it are searched.
+ *
+ * Returns the device found by __find_device(), or NULL if none matches.
+ */
+struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
+                                       u8 *uuid, u8 *fsid)
+{
+        struct btrfs_fs_devices *fs_devs;
+        struct btrfs_device *found;
+        for (fs_devs = root->fs_info->fs_devices; fs_devs;
+             fs_devs = fs_devs->seed) {
+                /* skip device sets that belong to a different fsid */
+                if (fsid && memcmp(fs_devs->fsid, fsid, BTRFS_UUID_SIZE))
+                        continue;
+                found = __find_device(&fs_devs->devices, devid, uuid);
+                if (found)
+                        return found;
+        }
+        return NULL;
+}
+/*
+ * Create a placeholder btrfs_device for @devid/@dev_uuid and link it into
+ * the filesystem's device list.  The structure is zero-allocated, so it
+ * has no bdev.
+ *
+ * Returns the new device, or NULL if the allocation fails.
+ */
+static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
+                                            u64 devid, u8 *dev_uuid)
+{
+        struct btrfs_fs_devices *fs_devs = root->fs_info->fs_devices;
+        struct btrfs_device *dev;
+        dev = kzalloc(sizeof(*dev), GFP_NOFS);
+        if (!dev)
+                return NULL;
+        list_add(&dev->dev_list, &fs_devs->devices);
+        dev->barriers = 1;
+        dev->dev_root = root->fs_info->dev_root;
+        dev->devid = devid;
+        dev->work.func = pending_bios_fn;
+        dev->fs_devices = fs_devs;
+        fs_devs->num_devices++;
+        spin_lock_init(&dev->io_lock);
+        INIT_LIST_HEAD(&dev->dev_alloc_list);
+        memcpy(dev->uuid, dev_uuid, BTRFS_UUID_SIZE);
+        return dev;
+}
+/*
+ * Build the in-memory mapping for one chunk item and insert it into the
+ * filesystem's mapping tree.
+ *
+ * @root:  any root of the filesystem (only root->fs_info is used, plus
+ *         the mount options)
+ * @key:   key of the chunk item; key->offset is the chunk's logical start
+ * @leaf:  extent buffer that holds the chunk item
+ * @chunk: location of the chunk item inside @leaf
+ *
+ * Returns 0 on success or when the logical start is already mapped,
+ * -ENOMEM if an allocation fails, and -EIO if a stripe's device cannot be
+ * found (without the DEGRADED option) or a placeholder device cannot be
+ * created for it.
+ */
+static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
+                          struct extent_buffer *leaf,
+                          struct btrfs_chunk *chunk)
+{
+        struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
+        struct map_lookup *map;
+        struct extent_map *em;
+        u64 logical;
+        u64 length;
+        u64 devid;
+        u8 uuid[BTRFS_UUID_SIZE];
+        int num_stripes;
+        int ret;
+        int i;
+        logical = key->offset;
+        length = btrfs_chunk_length(leaf, chunk);
+        spin_lock(&map_tree->map_tree.lock);
+        em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
+        spin_unlock(&map_tree->map_tree.lock);
+        /* already mapped? */
+        if (em && em->start <= logical && em->start + em->len > logical) {
+                free_extent_map(em);
+                return 0;
+        } else if (em) {
+                free_extent_map(em);
+        }
+        em = alloc_extent_map(GFP_NOFS);
+        if (!em)
+                return -ENOMEM;
+        /*
+         * map is allocated exactly once, sized for all of the chunk's
+         * stripes.  Allocating a bare struct map_lookup first and then
+         * overwriting the pointer would leak that first buffer.
+         */
+        num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+        map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
+        if (!map) {
+                free_extent_map(em);
+                return -ENOMEM;
+        }
+        /* the map_lookup rides in the extent map's bdev pointer */
+        em->bdev = (struct block_device *)map;
+        em->start = logical;
+        em->len = length;
+        em->block_start = 0;
+        em->block_len = em->len;
+        map->num_stripes = num_stripes;
+        map->io_width = btrfs_chunk_io_width(leaf, chunk);
+        map->io_align = btrfs_chunk_io_align(leaf, chunk);
+        map->sector_size = btrfs_chunk_sector_size(leaf, chunk);
+        map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
+        map->type = btrfs_chunk_type(leaf, chunk);
+        map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
+        for (i = 0; i < num_stripes; i++) {
+                map->stripes[i].physical =
+                        btrfs_stripe_offset_nr(leaf, chunk, i);
+                devid = btrfs_stripe_devid_nr(leaf, chunk, i);
+                read_extent_buffer(leaf, uuid, (unsigned long)
+                                   btrfs_stripe_dev_uuid_nr(chunk, i),
+                                   BTRFS_UUID_SIZE);
+                map->stripes[i].dev = btrfs_find_device(root, devid, uuid,
+                                                        NULL);
+                /* a missing device is only tolerated with DEGRADED set */
+                if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
+                        kfree(map);
+                        free_extent_map(em);
+                        return -EIO;
+                }
+                if (!map->stripes[i].dev) {
+                        /* degraded: stand in a placeholder device */
+                        map->stripes[i].dev =
+                                add_missing_dev(root, devid, uuid);
+                        if (!map->stripes[i].dev) {
+                                kfree(map);
+                                free_extent_map(em);
+                                return -EIO;
+                        }
+                }
+                map->stripes[i].dev->in_fs_metadata = 1;
+        }
+        spin_lock(&map_tree->map_tree.lock);
+        ret = add_extent_mapping(&map_tree->map_tree, em);
+        spin_unlock(&map_tree->map_tree.lock);
+        BUG_ON(ret);
+        /* drop our local reference to em now that it has been inserted */
+        free_extent_map(em);
+        return 0;
+}
+/*
+ * Copy the fields of an on-disk dev item (@dev_item inside @leaf) into the
+ * in-memory @device: id, sizes, type, io geometry and uuid.
+ * Always returns 0.
+ */
+static int fill_device_from_item(struct extent_buffer *leaf,
+                                 struct btrfs_dev_item *dev_item,
+                                 struct btrfs_device *device)
+{
+        /* identity and capacity */
+        device->devid = btrfs_device_id(leaf, dev_item);
+        device->total_bytes = btrfs_device_total_bytes(leaf, dev_item);
+        device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
+        device->type = btrfs_device_type(leaf, dev_item);
+        /* io geometry */
+        device->io_align = btrfs_device_io_align(leaf, dev_item);
+        device->io_width = btrfs_device_io_width(leaf, dev_item);
+        device->sector_size = btrfs_device_sector_size(leaf, dev_item);
+        /* the uuid is a byte array, so it is copied out of the buffer */
+        read_extent_buffer(leaf, device->uuid,
+                           (unsigned long)btrfs_device_uuid(dev_item),
+                           BTRFS_UUID_SIZE);
+        return 0;
+}
+/*
+ * Make sure the device set identified by @fsid is on the filesystem's
+ * seed chain, opening it read-only if it is not there yet.
+ *
+ * Runs under uuid_mutex.  Returns 0 if the set is already chained or was
+ * added, -ENOENT if no device set with @fsid is known, -EINVAL if the set
+ * is not marked as seeding, or the error from cloning/opening the set.
+ */
+static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
+{
+        struct btrfs_fs_devices *fs_devices;
+        int ret;
+        mutex_lock(&uuid_mutex);
+        /* already on the seed chain? */
+        fs_devices = root->fs_info->fs_devices->seed;
+        while (fs_devices) {
+                if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
+                        ret = 0;
+                        goto out;
+                }
+                fs_devices = fs_devices->seed;
+        }
+        fs_devices = find_fsid(fsid);
+        if (!fs_devices) {
+                ret = -ENOENT;
+                goto out;
+        }
+        /* work on a private copy from here on */
+        fs_devices = clone_fs_devices(fs_devices);
+        if (IS_ERR(fs_devices)) {
+                ret = PTR_ERR(fs_devices);
+                goto out;
+        }
+        ret = __btrfs_open_devices(fs_devices, FMODE_READ,
+                                   root->fs_info->bdev_holder);
+        if (ret) {
+                /*
+                 * The clone is not linked anywhere yet, so it has to be
+                 * released here or it is leaked.
+                 * NOTE(review): assumes a failed __btrfs_open_devices()
+                 * leaves no block device open on the clone -- confirm.
+                 */
+                free_fs_devices(fs_devices);
+                goto out;
+        }
+        if (!fs_devices->seeding) {
+                __btrfs_close_devices(fs_devices);
+                free_fs_devices(fs_devices);
+                ret = -EINVAL;
+                goto out;
+        }
+        /* push the clone onto the head of the seed chain */
+        fs_devices->seed = root->fs_info->fs_devices->seed;
+        root->fs_info->fs_devices->seed = fs_devices;
+out:
+        mutex_unlock(&uuid_mutex);
+        return ret;
+}
+/*
+ * Process one dev item: locate (or, in degraded mode, create) the matching
+ * in-memory device and fill it from the item.
+ *
+ * If the item's fsid differs from the filesystem's, the corresponding seed
+ * device set is opened first.  Returns 0 on success, -EIO when the device
+ * or its bdev is missing without the DEGRADED option, -ENOMEM when a
+ * placeholder device cannot be allocated, -EINVAL on a generation mismatch
+ * for a device of another device set, or the open_seed_devices() error.
+ */
+static int read_one_dev(struct btrfs_root *root,
+                        struct extent_buffer *leaf,
+                        struct btrfs_dev_item *dev_item)
+{
+        struct btrfs_device *device;
+        u64 devid;
+        int ret;
+        u8 fs_uuid[BTRFS_UUID_SIZE];
+        u8 dev_uuid[BTRFS_UUID_SIZE];
+        devid = btrfs_device_id(leaf, dev_item);
+        read_extent_buffer(leaf, dev_uuid,
+                           (unsigned long)btrfs_device_uuid(dev_item),
+                           BTRFS_UUID_SIZE);
+        read_extent_buffer(leaf, fs_uuid,
+                           (unsigned long)btrfs_device_fsid(dev_item),
+                           BTRFS_UUID_SIZE);
+        /* a foreign fsid means the item belongs to a seed device set */
+        if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE) != 0) {
+                ret = open_seed_devices(root, fs_uuid);
+                if (ret && !btrfs_test_opt(root, DEGRADED))
+                        return ret;
+        }
+        device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
+        /* an absent device or bdev is only acceptable with DEGRADED */
+        if ((!device || !device->bdev) && !btrfs_test_opt(root, DEGRADED))
+                return -EIO;
+        if (!device) {
+                printk(KERN_WARNING "warning devid %llu missing\n",
+                       (unsigned long long)devid);
+                device = add_missing_dev(root, devid, dev_uuid);
+                if (!device)
+                        return -ENOMEM;
+        }
+        /* devices of another device set must be read-only and in sync */
+        if (device->fs_devices != root->fs_info->fs_devices) {
+                BUG_ON(device->writeable);
+                if (device->generation !=
+                    btrfs_device_generation(leaf, dev_item))
+                        return -EINVAL;
+        }
+        fill_device_from_item(leaf, dev_item, device);
+        device->dev_root = root->fs_info->dev_root;
+        device->in_fs_metadata = 1;
+        if (device->writeable)
+                device->fs_devices->total_rw_bytes += device->total_bytes;
+        return 0;
+}
+/*
+ * Run read_one_dev() on the dev item embedded in the super block.  The
+ * "pointer" handed down is really the byte offset of dev_item inside
+ * struct btrfs_super_block, which addresses the item within @buf.
+ */
+int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf)
+{
+        unsigned long item_offset;
+        item_offset = offsetof(struct btrfs_super_block, dev_item);
+        return read_one_dev(root, buf, (struct btrfs_dev_item *)item_offset);
+}
+/*
+ * Walk the sys_chunk_array stored in the super block copy and create a
+ * chunk mapping for every entry via read_one_chunk().
+ *
+ * The array is a packed sequence of (disk key, chunk item) pairs.  It is
+ * walked twice in lock step: @ptr addresses the in-memory super copy to
+ * decode keys, while @sb_ptr is the matching offset inside a temporary
+ * extent buffer holding the same bytes, used for the chunk accessors.
+ *
+ * Returns 0 on success, -ENOMEM if the extent buffer cannot be obtained,
+ * -EIO if an entry is not a chunk item, or the read_one_chunk() error.
+ */
+int btrfs_read_sys_array(struct btrfs_root *root)
+{
+        struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+        struct extent_buffer *sb;
+        struct btrfs_disk_key *disk_key;
+        struct btrfs_chunk *chunk;
+        u8 *ptr;
+        unsigned long sb_ptr;
+        int ret = 0;
+        u32 num_stripes;
+        u32 array_size;
+        u32 len = 0;
+        u32 cur;
+        struct btrfs_key key;
+        sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
+                                          BTRFS_SUPER_INFO_SIZE);
+        if (!sb)
+                return -ENOMEM;
+        /* fill the buffer from the in-memory super copy */
+        btrfs_set_buffer_uptodate(sb);
+        write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
+        array_size = btrfs_super_sys_array_size(super_copy);
+        ptr = super_copy->sys_chunk_array;
+        sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
+        for (cur = 0; cur < array_size; ) {
+                disk_key = (struct btrfs_disk_key *)ptr;
+                btrfs_disk_key_to_cpu(&key, disk_key);
+                /* step over the key in both views of the array */
+                len = sizeof(*disk_key);
+                ptr += len;
+                sb_ptr += len;
+                cur += len;
+                /* only chunk items are expected here */
+                if (key.type != BTRFS_CHUNK_ITEM_KEY) {
+                        ret = -EIO;
+                        break;
+                }
+                chunk = (struct btrfs_chunk *)sb_ptr;
+                ret = read_one_chunk(root, &key, sb, chunk);
+                if (ret)
+                        break;
+                /* step over the chunk item, whose size depends on stripes */
+                num_stripes = btrfs_chunk_num_stripes(sb, chunk);
+                len = btrfs_chunk_item_size(num_stripes);
+                ptr += len;
+                sb_ptr += len;
+                cur += len;
+        }
+        free_extent_buffer(sb);
+        return ret;
+}
+/*
+ * Read the chunk tree in two passes over the same loop: first every dev
+ * item (objectid BTRFS_DEV_ITEMS_OBJECTID) is handed to read_one_dev(),
+ * then the tree is scanned again from key 0 and every chunk item is
+ * handed to read_one_chunk().
+ *
+ * Returns 0 on success, -ENOMEM if no path can be allocated, or the first
+ * error from the tree search, btrfs_next_leaf(), read_one_dev() or
+ * read_one_chunk().
+ */
+int btrfs_read_chunk_tree(struct btrfs_root *root)
+{
+        struct btrfs_path *path;
+        struct extent_buffer *leaf;
+        struct btrfs_key key;
+        struct btrfs_key found_key;
+        int ret;
+        int slot;
+        root = root->fs_info->chunk_root;
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+        /* first we search for all of the device items, and then we
+         * read in all of the chunk items.  This way we can create chunk
+         * mappings that reference all of the devices that are found
+         */
+        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+        key.offset = 0;
+        key.type = 0;
+again:
+        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+        /* a failed search must not be followed by a walk of the path */
+        if (ret < 0)
+                goto error;
+        while (1) {
+                leaf = path->nodes[0];
+                slot = path->slots[0];
+                if (slot >= btrfs_header_nritems(leaf)) {
+                        /* 0: moved to the next leaf, > 0: no more leaves */
+                        ret = btrfs_next_leaf(root, path);
+                        if (ret == 0)
+                                continue;
+                        if (ret < 0)
+                                goto error;
+                        break;
+                }
+                btrfs_item_key_to_cpu(leaf, &found_key, slot);
+                if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
+                        /* device pass: stop at the first foreign objectid */
+                        if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID)
+                                break;
+                        if (found_key.type == BTRFS_DEV_ITEM_KEY) {
+                                struct btrfs_dev_item *dev_item;
+                                dev_item = btrfs_item_ptr(leaf, slot,
+                                                  struct btrfs_dev_item);
+                                ret = read_one_dev(root, leaf, dev_item);
+                                if (ret)
+                                        goto error;
+                        }
+                } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
+                        struct btrfs_chunk *chunk;
+                        chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
+                        ret = read_one_chunk(root, &found_key, leaf, chunk);
+                        if (ret)
+                                goto error;
+                }
+                path->slots[0]++;
+        }
+        /* device pass finished: restart from key 0 for the chunk pass */
+        if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
+                key.objectid = 0;
+                btrfs_release_path(root, path);
+                goto again;
+        }
+        ret = 0;
+error:
+        btrfs_free_path(path);
+        return ret;
+}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
new file mode 100644
index 000000000000..86c44e9ae110
--- /dev/null
+++ b/fs/btrfs/volumes.h
@@ -0,0 +1,162 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#ifndef __BTRFS_VOLUMES_
+#define __BTRFS_VOLUMES_
+#include <linux/bio.h>
+#include "async-thread.h"
+struct buffer_head;
+/* in-memory state for one device of a btrfs filesystem */
+struct btrfs_device {
+        /* entry in btrfs_fs_devices->devices */
+        struct list_head dev_list;
+        /* NOTE(review): presumably the entry in fs_devices->alloc_list */
+        struct list_head dev_alloc_list;
+        /* the device set this device belongs to */
+        struct btrfs_fs_devices *fs_devices;
+        struct btrfs_root *dev_root;
+        /* head and tail of the bios queued for async submission */
+        struct bio *pending_bios;
+        struct bio *pending_bio_tail;
+        /* while set, queueing a bio does not queue the work item again */
+        int running_pending;
+        u64 generation;
+        int barriers;
+        /* bios submitted with rw == WRITE require this to be set */
+        int writeable;
+        /* set once a dev item or a chunk stripe refers to this device */
+        int in_fs_metadata;
+        /* taken around updates of the pending bio list */
+        spinlock_t io_lock;
+        /* NULL when no block device is open for this device */
+        struct block_device *bdev;
+        /* the mode sent to open_bdev_exclusive */
+        fmode_t mode;
+        char *name;
+        /* the internal btrfs device id */
+        u64 devid;
+        /* size of the device */
+        u64 total_bytes;
+        /* bytes used */
+        u64 bytes_used;
+        /* optimal io alignment for this device */
+        u32 io_align;
+        /* optimal io width for this device */
+        u32 io_width;
+        /* minimal io size for this device */
+        u32 sector_size;
+        /* type and info about this device */
+        u64 type;
+        /* physical drive uuid (or lvm uuid) */
+        u8 uuid[BTRFS_UUID_SIZE];
+        /* work item queued on the submit workers for the pending bios */
+        struct btrfs_work work;
+};
+/* the set of devices that share one filesystem uuid */
+struct btrfs_fs_devices {
+        u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
+        /* the device with this id has the most recent copy of the super */
+        u64 latest_devid;
+        u64 latest_trans;
+        u64 num_devices;
+        u64 open_devices;
+        u64 rw_devices;
+        /* sum of total_bytes over the writeable devices of the set */
+        u64 total_rw_bytes;
+        struct block_device *latest_bdev;
+        /* all of the devices in the FS */
+        struct list_head devices;
+        /* devices not currently being allocated */
+        struct list_head alloc_list;
+        struct list_head list;
+        /* next device set on the chain of seed filesystems, or NULL */
+        struct btrfs_fs_devices *seed;
+        /* must be non-zero for a set to be accepted as a seed */
+        int seeding;
+        int opened;
+};
+/* one target of a mapped bio: a device and a byte offset on it */
+struct btrfs_bio_stripe {
+        struct btrfs_device *dev;
+        /* byte offset on dev; shifted right by 9 to get the bio sector */
+        u64 physical;
+};
+/*
+ * Result of mapping a logical range: the list of stripes to send io to,
+ * plus the bookkeeping needed to complete the original bio once every
+ * per-stripe bio has finished.
+ */
+struct btrfs_multi_bio {
+        /* per-stripe bios still in flight; the last one completes orig_bio */
+        atomic_t stripes_pending;
+        /* end_io and private of the original bio, restored on completion */
+        bio_end_io_t *end_io;
+        struct bio *orig_bio;
+        void *private;
+        /* number of per-stripe bios that finished with an error */
+        atomic_t error;
+        /* an error is reported upward only if error exceeds this */
+        int max_errors;
+        /* number of valid entries in stripes[] */
+        int num_stripes;
+        struct btrfs_bio_stripe stripes[];
+};
+/* bytes needed for a btrfs_multi_bio that carries n stripes */
+#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \
+                            (sizeof(struct btrfs_bio_stripe) * (n)))
+int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
+                           struct btrfs_device *device,
+                           u64 chunk_tree, u64 chunk_objectid,
+                           u64 chunk_offset, u64 start, u64 num_bytes);
+/* map a logical range to its stripes; *multi_ret receives the result */
+int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
+                    u64 logical, u64 *length,
+                    struct btrfs_multi_bio **multi_ret, int mirror_num);
+/* reverse map: physical offset in a chunk to logical addresses */
+int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
+                     u64 chunk_start, u64 physical, u64 devid,
+                     u64 **logical, int *naddrs, int *stripe_len);
+/* create chunk mappings from the super block's sys_chunk_array */
+int btrfs_read_sys_array(struct btrfs_root *root);
+/* read all dev items, then all chunk items, from the chunk tree */
+int btrfs_read_chunk_tree(struct btrfs_root *root);
+int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+                      struct btrfs_root *extent_root, u64 type);
+void btrfs_mapping_init(struct btrfs_mapping_tree *tree);
+void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree);
+/* map a bio onto its device stripe(s) and submit it */
+int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
+                  int mirror_num, int async_submit);
+/* process the dev item embedded in the super block */
+int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf);
+int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
+                       fmode_t flags, void *holder);
+int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
+                          struct btrfs_fs_devices **fs_devices_ret);
+int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
+int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices);
+int btrfs_add_device(struct btrfs_trans_handle *trans,
+                     struct btrfs_root *root,
+                     struct btrfs_device *device);
+int btrfs_rm_device(struct btrfs_root *root, char *device_path);
+int btrfs_cleanup_fs_uuids(void);
+int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len);
+int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
+                      u64 logical, struct page *page);
+int btrfs_grow_device(struct btrfs_trans_handle *trans,
+                      struct btrfs_device *device, u64 new_size);
+/* find a device by devid/uuid, also searching the seed device sets */
+struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
+                                       u8 *uuid, u8 *fsid);
+int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
+int btrfs_init_new_device(struct btrfs_root *root, char *path);
+int btrfs_balance(struct btrfs_root *dev_root);
+void btrfs_unlock_volumes(void);
+void btrfs_lock_volumes(void);
+int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
+#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
new file mode 100644
index 000000000000..7f332e270894
--- /dev/null
+++ b/fs/btrfs/xattr.c
@@ -0,0 +1,322 @@
+/*
+ * Copyright (C) 2007 Red Hat.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/rwsem.h>
+#include <linux/xattr.h>
+#include "ctree.h"
+#include "btrfs_inode.h"
+#include "transaction.h"
+#include "xattr.h"
+#include "disk-io.h"
+/*
+ * Read the value of the xattr called @name on @inode.
+ *
+ * When @size is 0 nothing is copied and the length of the stored value is
+ * returned so the caller can size its buffer.  Otherwise the value is copied
+ * into @buffer and its length is returned.
+ *
+ * Returns -ENOMEM if no path could be allocated, -ENODATA if the attribute
+ * does not exist, -ERANGE if the value is larger than @size, or the error
+ * encoded in the pointer returned by btrfs_lookup_xattr().
+ */
+ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
+                                void *buffer, size_t size)
+{
+        struct btrfs_dir_item *di;
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        struct btrfs_path *path;
+        struct extent_buffer *leaf;
+        int ret = 0;
+        unsigned long data_ptr;
+        size_t data_len;
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+        /* lookup the xattr by name */
+        di = btrfs_lookup_xattr(NULL, root, path, inode->i_ino, name,
+                                strlen(name), 0);
+        if (!di) {
+                ret = -ENODATA;
+                goto out;
+        }
+        if (IS_ERR(di)) {
+                /* a failed lookup is not the same as a missing attribute */
+                ret = PTR_ERR(di);
+                goto out;
+        }
+        leaf = path->nodes[0];
+        data_len = btrfs_dir_data_len(leaf, di);
+        /* if size is 0, that means we want the size of the attr */
+        if (!size) {
+                ret = data_len;
+                goto out;
+        }
+        /* the caller's buffer must be able to hold the whole value */
+        if (data_len > size) {
+                ret = -ERANGE;
+                goto out;
+        }
+        /* the value is stored right after the dir_item header and the name */
+        data_ptr = (unsigned long)((char *)(di + 1) +
+                                   btrfs_dir_name_len(leaf, di));
+        read_extent_buffer(leaf, buffer, data_ptr, data_len);
+        ret = data_len;
+out:
+        btrfs_free_path(path);
+        return ret;
+}
+/*
+ * Create, replace or remove the xattr @name on @inode, inside a transaction
+ * started and ended here.
+ *
+ * An existing item is always deleted first; a new item holding @value is
+ * then inserted unless @value is NULL, which makes the call a pure removal.
+ * XATTR_CREATE fails with -EEXIST when the xattr already exists and
+ * XATTR_REPLACE fails with -ENODATA when it does not.  The inode ctime is
+ * updated only when the removal or the insertion succeeded.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int __btrfs_setxattr(struct inode *inode, const char *name,
+                            const void *value, size_t size, int flags)
+{
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        struct btrfs_trans_handle *trans;
+        struct btrfs_path *path;
+        struct btrfs_dir_item *existing;
+        int changed = 0;
+        int ret = 0;
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+        trans = btrfs_start_transaction(root, 1);
+        btrfs_set_trans_block_group(trans, inode);
+        /* find out whether an xattr with this name is already stored */
+        existing = btrfs_lookup_xattr(trans, root, path, inode->i_ino, name,
+                                      strlen(name), -1);
+        if (IS_ERR(existing)) {
+                ret = PTR_ERR(existing);
+                goto out;
+        }
+        if (!existing) {
+                btrfs_release_path(root, path);
+                /* there is nothing to replace */
+                if (flags & XATTR_REPLACE) {
+                        ret = -ENODATA;
+                        goto out;
+                }
+        } else {
+                /* create-only requests must not touch an existing xattr */
+                if (flags & XATTR_CREATE) {
+                        ret = -EEXIST;
+                        goto out;
+                }
+                /* drop the old item, a new one is inserted further down */
+                ret = btrfs_delete_one_dir_name(trans, root, path, existing);
+                if (ret)
+                        goto out;
+                btrfs_release_path(root, path);
+                /* without a value this was a removal and we are done */
+                if (!value) {
+                        changed = 1;
+                        goto out;
+                }
+        }
+        /* store the xattr as a completely new item */
+        ret = btrfs_insert_xattr_item(trans, root, name, strlen(name),
+                                      value, size, inode->i_ino);
+        if (!ret)
+                changed = 1;
+out:
+        if (changed) {
+                inode->i_ctime = CURRENT_TIME;
+                ret = btrfs_update_inode(trans, root, inode);
+        }
+        btrfs_end_transaction(trans, root);
+        btrfs_free_path(path);
+        return ret;
+}
+/*
+ * Copy the names of the xattrs stored for @dentry's inode into @buffer as a
+ * sequence of NUL terminated strings.
+ *
+ * When @size is 0 nothing is copied and only the space needed is computed.
+ *
+ * Returns the total number of bytes the names occupy (terminators included),
+ * -ENOMEM if no path could be allocated, -ERANGE if @buffer is NULL or too
+ * small, or the negative value returned by btrfs_search_slot().
+ */
+ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+        struct btrfs_key key, found_key;
+        struct inode *inode = dentry->d_inode;
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        struct btrfs_path *path;
+        struct btrfs_item *item;
+        struct extent_buffer *leaf;
+        struct btrfs_dir_item *di;
+        int ret = 0, slot, advance;
+        size_t total_size = 0, size_left = size;
+        unsigned long name_ptr;
+        size_t name_len;
+        u32 nritems;
+        /*
+         * ok we want all objects associated with this id.
+         * NOTE: we set key.offset = 0; because we want to start with the
+         * first xattr that we find and walk forward
+         */
+        key.objectid = inode->i_ino;
+        btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
+        key.offset = 0;
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+        path->reada = 2;
+        /* search for our xattrs */
+        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+        if (ret < 0)
+                goto err;
+        ret = 0;
+        /* the slot found by the search is examined before we step forward */
+        advance = 0;
+        while (1) {
+                leaf = path->nodes[0];
+                nritems = btrfs_header_nritems(leaf);
+                slot = path->slots[0];
+                /* this is where we start walking through the path */
+                if (advance || slot >= nritems) {
+                        /*
+                         * if we've reached the last slot in this leaf we need
+                         * to go to the next leaf and reset everything
+                         */
+                        if (slot >= nritems-1) {
+                                /*
+                                 * any non-zero return ends the walk; the value
+                                 * itself is overwritten by total_size below
+                                 */
+                                ret = btrfs_next_leaf(root, path);
+                                if (ret)
+                                        break;
+                                leaf = path->nodes[0];
+                                nritems = btrfs_header_nritems(leaf);
+                                slot = path->slots[0];
+                        } else {
+                                /*
+                                 * just walking through the slots on this leaf
+                                 */
+                                slot++;
+                                path->slots[0]++;
+                        }
+                }
+                advance = 1;
+                /* NOTE(review): item is assigned here but never read */
+                item = btrfs_item_nr(leaf, slot);
+                btrfs_item_key_to_cpu(leaf, &found_key, slot);
+                /* check to make sure this item is what we want */
+                if (found_key.objectid != key.objectid)
+                        break;
+                if (btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY)
+                        break;
+                /*
+                 * NOTE(review): only the dir_item at the start of the item is
+                 * read; if one item can hold several names the others are
+                 * not listed - confirm
+                 */
+                di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
+                name_len = btrfs_dir_name_len(leaf, di);
+                total_size += name_len + 1;
+                /* we are just looking for how big our buffer needs to be */
+                if (!size)
+                        continue;
+                if (!buffer || (name_len + 1) > size_left) {
+                        ret = -ERANGE;
+                        goto err;
+                }
+                /* the name starts right after the dir_item header */
+                name_ptr = (unsigned long)(di + 1);
+                read_extent_buffer(leaf, buffer, name_ptr, name_len);
+                buffer[name_len] = '\0';
+                size_left -= name_len + 1;
+                buffer += name_len + 1;
+        }
+        ret = total_size;
+err:
+        btrfs_free_path(path);
+        return ret;
+}
+/*
+ * List of handlers for synthetic system.* attributes.  All real ondisk
+ * attributes are handled directly.
+ *
+ * The POSIX ACL handlers are only listed when CONFIG_FS_POSIX_ACL is set;
+ * without it the array holds nothing but the terminating NULL.
+ */
+struct xattr_handler *btrfs_xattr_handlers[] = {
+#ifdef CONFIG_FS_POSIX_ACL
+        &btrfs_xattr_acl_access_handler,
+        &btrfs_xattr_acl_default_handler,
+#endif
+        NULL,
+};
+/*
+ * Tell whether @name lives in one of the xattr namespaces we store:
+ * security, system, trusted or user.
+ *
+ * Callers apply this after the check for the synthetic attributes in the
+ * system namespace.
+ */
+static bool btrfs_is_valid_xattr(const char *name)
+{
+        if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN))
+                return true;
+        if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+                return true;
+        if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN))
+                return true;
+        if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
+                return true;
+        return false;
+}
+/*
+ * getxattr entry point: system.* names are resolved by the generic code
+ * through the handlers in sb->s_xattr, names in the other supported
+ * namespaces are read by __btrfs_getxattr(), and anything else is rejected
+ * with -EOPNOTSUPP.
+ */
+ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
+                       void *buffer, size_t size)
+{
+        int is_system = !strncmp(name, XATTR_SYSTEM_PREFIX,
+                                 XATTR_SYSTEM_PREFIX_LEN);
+        /* synthetic attribute, let the generic infrastructure find a handler */
+        if (is_system)
+                return generic_getxattr(dentry, name, buffer, size);
+        if (btrfs_is_valid_xattr(name))
+                return __btrfs_getxattr(dentry->d_inode, name, buffer, size);
+        return -EOPNOTSUPP;
+}
+/*
+ * setxattr entry point: system.* names are resolved by the generic code
+ * through the handlers in sb->s_xattr, names in the other supported
+ * namespaces are written by __btrfs_setxattr(), and anything else is
+ * rejected with -EOPNOTSUPP.
+ */
+int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
+                   size_t size, int flags)
+{
+        int is_system = !strncmp(name, XATTR_SYSTEM_PREFIX,
+                                 XATTR_SYSTEM_PREFIX_LEN);
+        /* synthetic attribute, let the generic infrastructure find a handler */
+        if (is_system)
+                return generic_setxattr(dentry, name, value, size, flags);
+        if (!btrfs_is_valid_xattr(name))
+                return -EOPNOTSUPP;
+        /*
+         * __btrfs_setxattr() removes the xattr when value is NULL, so give
+         * an empty EA a non-NULL value to keep it from being removed
+         */
+        if (!size)
+                value = "";
+        return __btrfs_setxattr(dentry->d_inode, name, value, size, flags);
+}
+/*
+ * removexattr entry point: system.* names are resolved by the generic code
+ * through the handlers in sb->s_xattr; names in the other supported
+ * namespaces are removed by calling __btrfs_setxattr() with a NULL value
+ * and XATTR_REPLACE.  Anything else is rejected with -EOPNOTSUPP.
+ */
+int btrfs_removexattr(struct dentry *dentry, const char *name)
+{
+        int is_system = !strncmp(name, XATTR_SYSTEM_PREFIX,
+                                 XATTR_SYSTEM_PREFIX_LEN);
+        /* synthetic attribute, let the generic infrastructure find a handler */
+        if (is_system)
+                return generic_removexattr(dentry, name);
+        if (btrfs_is_valid_xattr(name))
+                return __btrfs_setxattr(dentry->d_inode, name, NULL, 0,
+                                        XATTR_REPLACE);
+        return -EOPNOTSUPP;
+}
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
new file mode 100644
index 000000000000..5b1d08f8e68d
--- /dev/null
+++ b/fs/btrfs/xattr.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (C) 2007 Red Hat.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#ifndef __XATTR__
+#define __XATTR__
+#include <linux/xattr.h>
+/* ACL handlers, listed in btrfs_xattr_handlers under CONFIG_FS_POSIX_ACL */
+extern struct xattr_handler btrfs_xattr_acl_access_handler;
+extern struct xattr_handler btrfs_xattr_acl_default_handler;
+/* NULL terminated list of handlers for synthetic system.* attributes */
+extern struct xattr_handler *btrfs_xattr_handlers[];
+/* inode level helpers that read and write the xattr items directly */
+extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
+                void *buffer, size_t size);
+extern int __btrfs_setxattr(struct inode *inode, const char *name,
+                const void *value, size_t size, int flags);
+/* dentry level entry points that check the attribute namespace first */
+extern ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
+                void *buffer, size_t size);
+extern int btrfs_setxattr(struct dentry *dentry, const char *name,
+                const void *value, size_t size, int flags);
+extern int btrfs_removexattr(struct dentry *dentry, const char *name);
+#endif /* __XATTR__ */
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
new file mode 100644
index 000000000000..ecfbce836d32
--- /dev/null
+++ b/fs/btrfs/zlib.c
@@ -0,0 +1,632 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Based on jffs2 zlib code:
+ * Copyright © 2001-2007 Red Hat, Inc.
+ * Created by David Woodhouse <dwmw2@infradead.org>
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/zlib.h>
+#include <linux/zutil.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/bio.h>
+#include "compression.h"
+/* Plan: call deflate() with avail_in == *sourcelen,
+        avail_out = *dstlen - 12 and flush == Z_FINISH.
+        If it doesn't manage to finish, call it again with
+        avail_in == 0 and avail_out set to the remaining 12
+        bytes for it to clean up.
+   Q: Is 12 bytes sufficient?
+*/
+#define STREAM_END_SPACE 12
+/* per-operation zlib state, recycled through the idle_workspace list */
+struct workspace {
+        z_stream inf_strm;      /* stream used by the decompress paths */
+        z_stream def_strm;      /* stream used by the compress path */
+        char *buf;              /* PAGE_CACHE_SIZE buffer inflate writes into */
+        struct list_head list;  /* entry on idle_workspace while unused */
+};
+/* workspaces that are allocated but not in use right now */
+static LIST_HEAD(idle_workspace);
+/* held while idle_workspace and num_workspace are changed */
+static DEFINE_SPINLOCK(workspace_lock);
+/* number of entries on idle_workspace */
+static unsigned long num_workspace;
+/* workspaces allocated and not yet freed, idle or in use */
+static atomic_t alloc_workspace = ATOMIC_INIT(0);
+/* tasks waiting for a workspace to be put back or freed */
+static DECLARE_WAIT_QUEUE_HEAD(workspace_wait);
+/*
+ * Hand out a zlib workspace: reuse the first one on the idle list when
+ * there is one, otherwise allocate a new one.  While more than
+ * num_online_cpus() workspaces are allocated, sleep on workspace_wait and
+ * retry instead of allocating.
+ *
+ * Returns the workspace, or ERR_PTR(-ENOMEM) when an allocation fails.
+ * NULL is never returned.
+ */
+static struct workspace *find_zlib_workspace(void)
+{
+        struct workspace *ws;
+        int limit = num_online_cpus();
+        for (;;) {
+                DEFINE_WAIT(wait);
+                /* fast path, take the first idle workspace if there is one */
+                spin_lock(&workspace_lock);
+                if (!list_empty(&idle_workspace)) {
+                        ws = list_entry(idle_workspace.next,
+                                        struct workspace, list);
+                        list_del(&ws->list);
+                        num_workspace--;
+                        spin_unlock(&workspace_lock);
+                        return ws;
+                }
+                spin_unlock(&workspace_lock);
+                /* not over the limit, go and allocate a new one */
+                if (atomic_read(&alloc_workspace) <= limit)
+                        break;
+                /* too many allocated already, wait for a wake up and retry */
+                prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
+                if (atomic_read(&alloc_workspace) > limit)
+                        schedule();
+                finish_wait(&workspace_wait, &wait);
+        }
+        /* count the workspace before allocating it */
+        atomic_inc(&alloc_workspace);
+        ws = kzalloc(sizeof(*ws), GFP_NOFS);
+        if (!ws)
+                goto fail;
+        ws->def_strm.workspace = vmalloc(zlib_deflate_workspacesize());
+        if (!ws->def_strm.workspace)
+                goto fail;
+        ws->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
+        if (!ws->inf_strm.workspace)
+                goto fail_inflate;
+        ws->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
+        if (!ws->buf)
+                goto fail_kmalloc;
+        return ws;
+fail_kmalloc:
+        vfree(ws->inf_strm.workspace);
+fail_inflate:
+        vfree(ws->def_strm.workspace);
+fail:
+        kfree(ws);
+        /* give the slot back and let a waiter try again */
+        atomic_dec(&alloc_workspace);
+        wake_up(&workspace_wait);
+        return ERR_PTR(-ENOMEM);
+}
+/*
+ * Give a workspace back.  It is parked on the idle list while fewer than
+ * num_online_cpus() workspaces are idle, otherwise it is freed.  Either
+ * way anybody sleeping on workspace_wait is woken up.  Always returns 0.
+ */
+static int free_workspace(struct workspace *workspace)
+{
+        int keep;
+        spin_lock(&workspace_lock);
+        keep = num_workspace < num_online_cpus();
+        if (keep) {
+                list_add_tail(&workspace->list, &idle_workspace);
+                num_workspace++;
+        }
+        spin_unlock(&workspace_lock);
+        if (!keep) {
+                /* enough idle ones sitting around, release this one */
+                vfree(workspace->def_strm.workspace);
+                vfree(workspace->inf_strm.workspace);
+                kfree(workspace->buf);
+                kfree(workspace);
+                atomic_dec(&alloc_workspace);
+        }
+        if (waitqueue_active(&workspace_wait))
+                wake_up(&workspace_wait);
+        return 0;
+}
+/*
+ * cleanup function for module exit: release every workspace that is
+ * still sitting on the idle list
+ */
+static void free_workspaces(void)
+{
+        while (!list_empty(&idle_workspace)) {
+                struct list_head *first = idle_workspace.next;
+                struct workspace *ws = list_entry(first, struct workspace,
+                                                  list);
+                list_del(first);
+                vfree(ws->def_strm.workspace);
+                vfree(ws->inf_strm.workspace);
+                kfree(ws->buf);
+                kfree(ws);
+                atomic_dec(&alloc_workspace);
+        }
+}
+/*
+ * given an address space and start/len, compress the bytes.
+ *
+ * pages are allocated to hold the compressed result and stored
+ * in 'pages'
+ *
+ * nr_dest_pages is the number of entries available in 'pages'
+ *
+ * out_pages is used to return the number of pages allocated.  There
+ * may be pages allocated even if we return an error
+ *
+ * total_in is used to return the number of bytes actually read.  It
+ * may be smaller than len if we had to exit early because we
+ * ran out of room in the pages array or because we cross the
+ * max_out threshold.
+ *
+ * total_out is used to return the total number of compressed bytes
+ *
+ * max_out tells us the max number of bytes that we're allowed to
+ * stuff into pages
+ *
+ * Returns 0 when the data was compressed and came out smaller than the
+ * input, -1 otherwise.  total_in and total_out are only filled in on
+ * success.
+ */
+int btrfs_zlib_compress_pages(struct address_space *mapping,
+                              u64 start, unsigned long len,
+                              struct page **pages,
+                              unsigned long nr_dest_pages,
+                              unsigned long *out_pages,
+                              unsigned long *total_in,
+                              unsigned long *total_out,
+                              unsigned long max_out)
+{
+        int ret;
+        struct workspace *workspace;
+        char *data_in;
+        char *cpage_out;
+        int nr_pages = 0;
+        struct page *in_page = NULL;
+        struct page *out_page = NULL;
+        unsigned long bytes_left;
+        *out_pages = 0;
+        *total_out = 0;
+        *total_in = 0;
+        /* find_zlib_workspace() reports failure with an ERR_PTR */
+        workspace = find_zlib_workspace();
+        if (!workspace || IS_ERR(workspace))
+                return -1;
+        if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
+                printk(KERN_WARNING "deflateInit failed\n");
+                ret = -1;
+                goto out;
+        }
+        workspace->def_strm.total_in = 0;
+        workspace->def_strm.total_out = 0;
+        in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
+        data_in = kmap(in_page);
+        out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+        if (!out_page) {
+                /* nothing to map; 'out' still unmaps and drops in_page */
+                ret = -1;
+                goto out;
+        }
+        cpage_out = kmap(out_page);
+        pages[0] = out_page;
+        nr_pages = 1;
+        workspace->def_strm.next_in = data_in;
+        workspace->def_strm.next_out = cpage_out;
+        workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
+        workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE);
+        while (workspace->def_strm.total_in < len) {
+                ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);
+                if (ret != Z_OK) {
+                        printk(KERN_DEBUG "btrfs deflate in loop returned %d\n",
+                               ret);
+                        zlib_deflateEnd(&workspace->def_strm);
+                        ret = -1;
+                        goto out;
+                }
+                /* we're making it bigger, give up */
+                if (workspace->def_strm.total_in > 8192 &&
+                    workspace->def_strm.total_in <
+                    workspace->def_strm.total_out) {
+                        ret = -1;
+                        goto out;
+                }
+                /* we need another page for writing out.  Test this
+                 * before the total_in so we will pull in a new page for
+                 * the stream end if required
+                 */
+                if (workspace->def_strm.avail_out == 0) {
+                        kunmap(out_page);
+                        /* out_page is unmapped now, 'out' must not touch it */
+                        if (nr_pages == nr_dest_pages) {
+                                out_page = NULL;
+                                ret = -1;
+                                goto out;
+                        }
+                        out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+                        if (!out_page) {
+                                ret = -1;
+                                goto out;
+                        }
+                        cpage_out = kmap(out_page);
+                        pages[nr_pages] = out_page;
+                        nr_pages++;
+                        workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
+                        workspace->def_strm.next_out = cpage_out;
+                }
+                /* we're all done */
+                if (workspace->def_strm.total_in >= len)
+                        break;
+                /* we've read in a full page, get a new one */
+                if (workspace->def_strm.avail_in == 0) {
+                        if (workspace->def_strm.total_out > max_out)
+                                break;
+                        bytes_left = len - workspace->def_strm.total_in;
+                        kunmap(in_page);
+                        page_cache_release(in_page);
+                        start += PAGE_CACHE_SIZE;
+                        in_page = find_get_page(mapping,
+                                                start >> PAGE_CACHE_SHIFT);
+                        data_in = kmap(in_page);
+                        workspace->def_strm.avail_in = min(bytes_left,
+                                                           PAGE_CACHE_SIZE);
+                        workspace->def_strm.next_in = data_in;
+                }
+        }
+        /* no more input, let zlib write out the end of the stream */
+        workspace->def_strm.avail_in = 0;
+        ret = zlib_deflate(&workspace->def_strm, Z_FINISH);
+        zlib_deflateEnd(&workspace->def_strm);
+        if (ret != Z_STREAM_END) {
+                ret = -1;
+                goto out;
+        }
+        /* compression that does not shrink the data is treated as failure */
+        if (workspace->def_strm.total_out >= workspace->def_strm.total_in) {
+                ret = -1;
+                goto out;
+        }
+        ret = 0;
+        *total_out = workspace->def_strm.total_out;
+        *total_in = workspace->def_strm.total_in;
+out:
+        /* report the pages we allocated even when we fail */
+        *out_pages = nr_pages;
+        if (out_page)
+                kunmap(out_page);
+        if (in_page) {
+                kunmap(in_page);
+                page_cache_release(in_page);
+        }
+        free_workspace(workspace);
+        return ret;
+}
+/*
+ * pages_in is an array of pages with compressed data.
+ *
+ * disk_start is the starting logical offset of this array in the file
+ *
+ * bvec is a bio_vec of pages from the file that we want to decompress into
+ *
+ * vcnt is the count of pages in the biovec
+ *
+ * srclen is the number of bytes in pages_in
+ *
+ * The basic idea is that we have a bio that was created by readpages.
+ * The pages in the bio are for the uncompressed data, and they may not
+ * be contiguous.  They all correspond to the range of bytes covered by
+ * the compressed extent.
+ *
+ * Returns 0 when the stream was inflated to its end or every page in the
+ * biovec was filled, -ENOMEM when no workspace could be had, -1 otherwise.
+ */
+int btrfs_zlib_decompress_biovec(struct page **pages_in,
+                              u64 disk_start,
+                              struct bio_vec *bvec,
+                              int vcnt,
+                              size_t srclen)
+{
+        int ret = 0;
+        int wbits = MAX_WBITS;
+        struct workspace *workspace;
+        char *data_in;
+        size_t total_out = 0;
+        unsigned long page_bytes_left;
+        unsigned long page_in_index = 0;
+        unsigned long page_out_index = 0;
+        struct page *page_out;
+        unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
+                                        PAGE_CACHE_SIZE;
+        unsigned long buf_start;
+        unsigned long buf_offset;
+        unsigned long bytes;
+        unsigned long working_bytes;
+        unsigned long pg_offset;
+        unsigned long start_byte;
+        unsigned long current_buf_start;
+        char *kaddr;
+        /* find_zlib_workspace() reports failure with an ERR_PTR */
+        workspace = find_zlib_workspace();
+        if (!workspace || IS_ERR(workspace))
+                return -ENOMEM;
+        data_in = kmap(pages_in[page_in_index]);
+        workspace->inf_strm.next_in = data_in;
+        workspace->inf_strm.avail_in = min_t(size_t, srclen, PAGE_CACHE_SIZE);
+        workspace->inf_strm.total_in = 0;
+        workspace->inf_strm.total_out = 0;
+        workspace->inf_strm.next_out = workspace->buf;
+        workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
+        page_out = bvec[page_out_index].bv_page;
+        page_bytes_left = PAGE_CACHE_SIZE;
+        pg_offset = 0;
+        /* If it's deflate, and it's got no preset dictionary, then
+         * we can tell zlib to skip the adler32 check.
+         *
+         * NOTE(review): data_in is a plain char pointer, so where char is
+         * signed a second header byte >= 0x80 is negative in the % 31 test
+         * below and the shortcut is then not taken - confirm this is
+         * intended
+         */
+        if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
+            ((data_in[0] & 0x0f) == Z_DEFLATED) &&
+            !(((data_in[0]<<8) + data_in[1]) % 31)) {
+                wbits = -((data_in[0] >> 4) + 8);
+                workspace->inf_strm.next_in += 2;
+                workspace->inf_strm.avail_in -= 2;
+        }
+        if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
+                printk(KERN_WARNING "inflateInit failed\n");
+                /* the first input page is still mapped, 'out' won't unmap it */
+                kunmap(pages_in[page_in_index]);
+                ret = -1;
+                goto out;
+        }
+        while (workspace->inf_strm.total_in < srclen) {
+                ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
+                if (ret != Z_OK && ret != Z_STREAM_END)
+                        break;
+                /*
+                 * buf_start is the offset in the decompressed stream of
+                 * the first byte held in our workspace buffer
+                 */
+                buf_start = total_out;
+                /* total_out is the stream offset where the buffer ends */
+                total_out = workspace->inf_strm.total_out;
+                working_bytes = total_out - buf_start;
+                /*
+                 * start byte is the first byte of the page we're currently
+                 * copying into relative to the start of the compressed data.
+                 */
+                start_byte = page_offset(page_out) - disk_start;
+                if (working_bytes == 0) {
+                        /* we didn't make progress in this inflate
+                         * call, we're done
+                         */
+                        if (ret != Z_STREAM_END)
+                                ret = -1;
+                        break;
+                }
+                /* we haven't yet hit data corresponding to this page */
+                if (total_out <= start_byte)
+                        goto next;
+                /*
+                 * the start of the data we care about is offset into
+                 * the middle of our working buffer
+                 */
+                if (total_out > start_byte && buf_start < start_byte) {
+                        buf_offset = start_byte - buf_start;
+                        working_bytes -= buf_offset;
+                } else {
+                        buf_offset = 0;
+                }
+                current_buf_start = buf_start;
+                /* copy bytes from the working buffer into the pages */
+                while (working_bytes > 0) {
+                        bytes = min(PAGE_CACHE_SIZE - pg_offset,
+                                    PAGE_CACHE_SIZE - buf_offset);
+                        bytes = min(bytes, working_bytes);
+                        kaddr = kmap_atomic(page_out, KM_USER0);
+                        memcpy(kaddr + pg_offset, workspace->buf + buf_offset,
+                               bytes);
+                        kunmap_atomic(kaddr, KM_USER0);
+                        flush_dcache_page(page_out);
+                        pg_offset += bytes;
+                        page_bytes_left -= bytes;
+                        buf_offset += bytes;
+                        working_bytes -= bytes;
+                        current_buf_start += bytes;
+                        /* check if we need to pick another page */
+                        if (page_bytes_left == 0) {
+                                page_out_index++;
+                                /* every page in the biovec has been filled */
+                                if (page_out_index >= vcnt) {
+                                        ret = 0;
+                                        goto done;
+                                }
+                                page_out = bvec[page_out_index].bv_page;
+                                pg_offset = 0;
+                                page_bytes_left = PAGE_CACHE_SIZE;
+                                start_byte = page_offset(page_out) - disk_start;
+                                /*
+                                 * make sure our new page is covered by this
+                                 * working buffer
+                                 */
+                                if (total_out <= start_byte)
+                                        goto next;
+                                /* the next page in the biovec might not
+                                 * be adjacent to the last page, but it
+                                 * might still be found inside this working
+                                 * buffer.  bump our offset pointer
+                                 */
+                                if (total_out > start_byte &&
+                                    current_buf_start < start_byte) {
+                                        buf_offset = start_byte - buf_start;
+                                        working_bytes = total_out - start_byte;
+                                        current_buf_start = buf_start +
+                                                buf_offset;
+                                }
+                        }
+                }
+next:
+                /* hand the whole workspace buffer back to zlib */
+                workspace->inf_strm.next_out = workspace->buf;
+                workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
+                /* this input page is used up, move on to the next one */
+                if (workspace->inf_strm.avail_in == 0) {
+                        unsigned long tmp;
+                        kunmap(pages_in[page_in_index]);
+                        page_in_index++;
+                        if (page_in_index >= total_pages_in) {
+                                /* nothing is mapped any more, see 'done' */
+                                data_in = NULL;
+                                break;
+                        }
+                        data_in = kmap(pages_in[page_in_index]);
+                        workspace->inf_strm.next_in = data_in;
+                        tmp = srclen - workspace->inf_strm.total_in;
+                        workspace->inf_strm.avail_in = min(tmp,
+                                                           PAGE_CACHE_SIZE);
+                }
+        }
+        /* leaving the loop before the stream ended is an error */
+        if (ret != Z_STREAM_END)
+                ret = -1;
+        else
+                ret = 0;
+done:
+        zlib_inflateEnd(&workspace->inf_strm);
+        /* data_in is non-NULL while an input page is still mapped */
+        if (data_in)
+                kunmap(pages_in[page_in_index]);
+out:
+        free_workspace(workspace);
+        return ret;
+}
+/*
+ * a less complex decompression routine.  Our compressed data fits in a
+ * single page, and we want to read a single page out of it.
+ *
+ * data_in:    start of the compressed zlib stream
+ * dest_page:  page that receives the decompressed bytes, filled from
+ *             offset 0 upwards
+ * start_byte: offset into the *decompressed* stream of the first byte we
+ *             are interested in (it is compared against zlib's total_out)
+ * srclen:     number of compressed bytes available at data_in
+ * destlen:    number of decompressed bytes wanted, at most PAGE_CACHE_SIZE
+ *
+ * Returns 0 on success, -ENOMEM when destlen is larger than a page or no
+ * workspace could be found, and -1 when inflate setup or inflation fails
+ * before destlen bytes have been copied.
+ */
+int btrfs_zlib_decompress(unsigned char *data_in,
+                          struct page *dest_page,
+                          unsigned long start_byte,
+                          size_t srclen, size_t destlen)
+{
+        int ret = 0;
+        int wbits = MAX_WBITS;
+        struct workspace *workspace;
+        unsigned long bytes_left = destlen;
+        unsigned long total_out = 0;
+        /*
+         * write position inside dest_page.  This has to live across loop
+         * iterations: the wanted range can straddle two fills of the
+         * working buffer, and the second copy must append to the first
+         * one instead of overwriting the start of the page.
+         */
+        unsigned long pg_offset = 0;
+        char *kaddr;
+        if (destlen > PAGE_CACHE_SIZE)
+                return -ENOMEM;
+        workspace = find_zlib_workspace();
+        if (!workspace)
+                return -ENOMEM;
+        workspace->inf_strm.next_in = data_in;
+        workspace->inf_strm.avail_in = srclen;
+        workspace->inf_strm.total_in = 0;
+        workspace->inf_strm.next_out = workspace->buf;
+        workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
+        workspace->inf_strm.total_out = 0;
+        /* If it's deflate, and it's got no preset dictionary, then
+           we can tell zlib to skip the adler32 check. */
+        if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
+            ((data_in[0] & 0x0f) == Z_DEFLATED) &&
+            !(((data_in[0]<<8) + data_in[1]) % 31)) {
+                /* negative wbits: skip the two header bytes ourselves */
+                wbits = -((data_in[0] >> 4) + 8);
+                workspace->inf_strm.next_in += 2;
+                workspace->inf_strm.avail_in -= 2;
+        }
+        if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
+                printk(KERN_WARNING "inflateInit failed\n");
+                ret = -1;
+                goto out;
+        }
+        while (bytes_left > 0) {
+                unsigned long buf_start;
+                unsigned long buf_offset;
+                unsigned long bytes;
+                ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
+                if (ret != Z_OK && ret != Z_STREAM_END)
+                        break;
+                /*
+                 * the working buffer now holds decompressed bytes
+                 * [buf_start, total_out) of the stream
+                 */
+                buf_start = total_out;
+                total_out = workspace->inf_strm.total_out;
+                /* no forward progress, give up instead of spinning */
+                if (total_out == buf_start) {
+                        ret = -1;
+                        break;
+                }
+                /* this buffer ends before the range we care about */
+                if (total_out <= start_byte)
+                        goto next;
+                /* first useful buffer: skip the bytes before start_byte */
+                if (total_out > start_byte && buf_start < start_byte)
+                        buf_offset = start_byte - buf_start;
+                else
+                        buf_offset = 0;
+                bytes = min(PAGE_CACHE_SIZE - pg_offset,
+                            PAGE_CACHE_SIZE - buf_offset);
+                bytes = min(bytes, bytes_left);
+                kaddr = kmap_atomic(dest_page, KM_USER0);
+                memcpy(kaddr + pg_offset, workspace->buf + buf_offset, bytes);
+                kunmap_atomic(kaddr, KM_USER0);
+                pg_offset += bytes;
+                bytes_left -= bytes;
+next:
+                /* hand the whole working buffer back to zlib */
+                workspace->inf_strm.next_out = workspace->buf;
+                workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
+        }
+        if (ret != Z_STREAM_END && bytes_left != 0)
+                ret = -1;
+        else
+                ret = 0;
+        zlib_inflateEnd(&workspace->inf_strm);
+out:
+        free_workspace(workspace);
+        return ret;
+}
+/* exit routine for the zlib code: just hands off to free_workspaces() */
+void btrfs_zlib_exit(void)
+{
+    free_workspaces();
+}
diff --git a/fs/buffer.c b/fs/buffer.c
index 776ae091d3b0..b58208f1640a 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -203,10 +203,25 @@ int fsync_bdev(struct block_device *bdev)
 * happen on bdev until thaw_bdev() is called.
 * If a superblock is found on this device, we take the s_umount semaphore
 * on it to make sure nobody unmounts until the snapshot creation is done.
+ * The reference counter (bd_fsfreeze_count) guarantees that only the last
+ * unfreeze process can actually unfreeze the frozen filesystem when multiple
+ * freeze requests arrive simultaneously. It counts up in freeze_bdev() and
+ * counts down in thaw_bdev(). When it becomes 0, thaw_bdev() actually
+ * unfreezes the filesystem.
 */
 struct super_block *freeze_bdev(struct block_device *bdev)
 {
        struct super_block *sb;
+        int error = 0;
+        mutex_lock(&bdev->bd_fsfreeze_mutex);
+        if (bdev->bd_fsfreeze_count > 0) {
+                bdev->bd_fsfreeze_count++;
+                sb = get_super(bdev);
+                mutex_unlock(&bdev->bd_fsfreeze_mutex);
+                return sb;
+        }
+        bdev->bd_fsfreeze_count++;
        down(&bdev->bd_mount_sem);
        sb = get_super(bdev);
@@ -221,11 +236,24 @@ struct super_block *freeze_bdev(struct block_device *bdev)
                sync_blockdev(sb->s_bdev);
-                if (sb->s_op->write_super_lockfs)
+                if (sb->s_op->freeze_fs) {
-                        sb->s_op->write_super_lockfs(sb);
+                        error = sb->s_op->freeze_fs(sb);
+                        if (error) {
+                                printk(KERN_ERR
+                                        "VFS:Filesystem freeze failed\n");
+                                sb->s_frozen = SB_UNFROZEN;
+                                drop_super(sb);
+                                up(&bdev->bd_mount_sem);
+                                bdev->bd_fsfreeze_count--;
+                                mutex_unlock(&bdev->bd_fsfreeze_mutex);
+                                return ERR_PTR(error);
+                        }
+                }
        }
        sync_blockdev(bdev);
+        mutex_unlock(&bdev->bd_fsfreeze_mutex);
        return sb;      /* thaw_bdev releases s->s_umount and bd_mount_sem */
 }
 EXPORT_SYMBOL(freeze_bdev);
@@ -237,20 +265,48 @@ EXPORT_SYMBOL(freeze_bdev);
 *
 * Unlocks the filesystem and marks it writeable again after freeze_bdev().
 */
-void thaw_bdev(struct block_device *bdev, struct super_block *sb)
+int thaw_bdev(struct block_device *bdev, struct super_block *sb)
 {
+        int error = 0;
+        mutex_lock(&bdev->bd_fsfreeze_mutex);
+        if (!bdev->bd_fsfreeze_count) {
+                mutex_unlock(&bdev->bd_fsfreeze_mutex);
+                return -EINVAL;
+        }
+        bdev->bd_fsfreeze_count--;
+        if (bdev->bd_fsfreeze_count > 0) {
+                if (sb)
+                        drop_super(sb);
+                mutex_unlock(&bdev->bd_fsfreeze_mutex);
+                return 0;
+        }
        if (sb) {
                BUG_ON(sb->s_bdev != bdev);
+                if (!(sb->s_flags & MS_RDONLY)) {
-                if (sb->s_op->unlockfs)
+                        if (sb->s_op->unfreeze_fs) {
-                        sb->s_op->unlockfs(sb);
+                                error = sb->s_op->unfreeze_fs(sb);
-                sb->s_frozen = SB_UNFROZEN;
+                                if (error) {
-                smp_wmb();
+                                        printk(KERN_ERR
-                wake_up(&sb->s_wait_unfrozen);
+                                                "VFS:Filesystem thaw failed\n");
+                                        sb->s_frozen = SB_FREEZE_TRANS;
+                                        bdev->bd_fsfreeze_count++;
+                                        mutex_unlock(&bdev->bd_fsfreeze_mutex);
+                                        return error;
+                                }
+                        }
+                        sb->s_frozen = SB_UNFROZEN;
+                        smp_wmb();
+                        wake_up(&sb->s_wait_unfrozen);
+                }
                drop_super(sb);
        }
        up(&bdev->bd_mount_sem);
+        mutex_unlock(&bdev->bd_fsfreeze_mutex);
+        return 0;
 }
 EXPORT_SYMBOL(thaw_bdev);
@@ -1996,7 +2052,7 @@ int block_write_begin(struct file *file, struct address_space *mapping,
        page = *pagep;
        if (page == NULL) {
                ownpage = 1;
-                page = __grab_cache_page(mapping, index);
+                page = grab_cache_page_write_begin(mapping, index, flags);
                if (!page) {
                        status = -ENOMEM;
                        goto out;
@@ -2022,7 +2078,6 @@ int block_write_begin(struct file *file, struct address_space *mapping,
                        if (pos + len > inode->i_size)
                                vmtruncate(inode, inode->i_size);
                }
-                goto out;
        }
 out:
@@ -2502,7 +2557,7 @@ int nobh_write_begin(struct file *file, struct address_space *mapping,
        from = pos & (PAGE_CACHE_SIZE - 1);
        to = from + len;
-        page = __grab_cache_page(mapping, index);
+        page = grab_cache_page_write_begin(mapping, index, flags);
        if (!page)
                return -ENOMEM;
        *pagep = page;
@@ -3188,7 +3243,7 @@ void block_sync_page(struct page *page)
 * Use of bdflush() is deprecated and will be removed in a future kernel.
 * The `pdflush' kernel threads fully replace bdflush daemons and this call.
 */
-asmlinkage long sys_bdflush(int func, long data)
+SYSCALL_DEFINE2(bdflush, int, func, long, data)
 {
        static int msg_count;
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 700697a72618..38f71222a552 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -120,7 +120,7 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor,
        cd->major = major;
        cd->baseminor = baseminor;
        cd->minorct = minorct;
-        strncpy(cd->name,name, 64);
+        strlcpy(cd->name, name, sizeof(cd->name));
        i = major_to_index(major);
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 080703a15f44..73ac7ebd1dfc 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -5,7 +5,9 @@ rather than posix (advisory) byte range locks, even though server would
 support posix byte range locks.  Fix query of root inode when prefixpath
 specified and user does not have access to query information about the
 top of the share.  Fix problem in 2.6.28 resolving DFS paths to
-Samba servers (worked to Windows).
+Samba servers (worked to Windows).  Fix rmdir so that pending search
+(readdir) requests do not get invalid results which include the now
+removed directory.
 Version 1.55
 ------------
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index 6ba43fb346fb..9948c0030e86 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -5,7 +5,7 @@ obj-$(CONFIG_CIFS) += cifs.o
 cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \
          link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o \
-          md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o fcntl.o \
+          md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o \
          readdir.o ioctl.o sess.o export.o cifsacl.o
 cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index d4839cf0cb2c..7c9809523f42 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -48,11 +48,11 @@ static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu,
        if ((cifs_pdu == NULL) || (signature == NULL) || (key == NULL))
                return -EINVAL;
-        MD5Init(&context);
+        cifs_MD5_init(&context);
-        MD5Update(&context, (char *)&key->data, key->len);
+        cifs_MD5_update(&context, (char *)&key->data, key->len);
-        MD5Update(&context, cifs_pdu->Protocol, cifs_pdu->smb_buf_length);
+        cifs_MD5_update(&context, cifs_pdu->Protocol, cifs_pdu->smb_buf_length);
-        MD5Final(signature, &context);
+        cifs_MD5_final(signature, &context);
        return 0;
 }
@@ -96,8 +96,8 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
        if ((iov == NULL) || (signature == NULL) || (key == NULL))
                return -EINVAL;
-        MD5Init(&context);
+        cifs_MD5_init(&context);
-        MD5Update(&context, (char *)&key->data, key->len);
+        cifs_MD5_update(&context, (char *)&key->data, key->len);
        for (i = 0; i < n_vec; i++) {
                if (iov[i].iov_len == 0)
                        continue;
@@ -110,13 +110,13 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
                if (i == 0) {
                        if (iov[0].iov_len <= 8) /* cmd field at offset 9 */
                                break; /* nothing to sign or corrupt header */
-                        MD5Update(&context, iov[0].iov_base+4,
+                        cifs_MD5_update(&context, iov[0].iov_base+4,
                                  iov[0].iov_len-4);
                } else
-                        MD5Update(&context, iov[i].iov_base, iov[i].iov_len);
+                        cifs_MD5_update(&context, iov[i].iov_base, iov[i].iov_len);
        }
-        MD5Final(signature, &context);
+        cifs_MD5_final(signature, &context);
        return 0;
 }
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 0005a194a75c..13ea53251dcf 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -747,7 +747,6 @@ const struct file_operations cifs_file_ops = {
 #endif /* CONFIG_CIFS_POSIX */
 #ifdef CONFIG_CIFS_EXPERIMENTAL
-        .dir_notify = cifs_dir_notify,
        .setlease = cifs_setlease,
 #endif /* CONFIG_CIFS_EXPERIMENTAL */
 };
@@ -768,7 +767,6 @@ const struct file_operations cifs_file_direct_ops = {
 #endif /* CONFIG_CIFS_POSIX */
        .llseek = cifs_llseek,
 #ifdef CONFIG_CIFS_EXPERIMENTAL
-        .dir_notify = cifs_dir_notify,
        .setlease = cifs_setlease,
 #endif /* CONFIG_CIFS_EXPERIMENTAL */
 };
@@ -789,7 +787,6 @@ const struct file_operations cifs_file_nobrl_ops = {
 #endif /* CONFIG_CIFS_POSIX */
 #ifdef CONFIG_CIFS_EXPERIMENTAL
-        .dir_notify = cifs_dir_notify,
        .setlease = cifs_setlease,
 #endif /* CONFIG_CIFS_EXPERIMENTAL */
 };
@@ -809,7 +806,6 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
 #endif /* CONFIG_CIFS_POSIX */
        .llseek = cifs_llseek,
 #ifdef CONFIG_CIFS_EXPERIMENTAL
-        .dir_notify = cifs_dir_notify,
        .setlease = cifs_setlease,
 #endif /* CONFIG_CIFS_EXPERIMENTAL */
 };
@@ -818,9 +814,6 @@ const struct file_operations cifs_dir_ops = {
        .readdir = cifs_readdir,
        .release = cifs_closedir,
        .read    = generic_read_dir,
-#ifdef CONFIG_CIFS_EXPERIMENTAL
-        .dir_notify = cifs_dir_notify,
-#endif /* CONFIG_CIFS_EXPERIMENTAL */
        .unlocked_ioctl  = cifs_ioctl,
        .llseek = generic_file_llseek,
 };
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 2ce04c73d74e..7ac481841f87 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -76,7 +76,6 @@ extern int cifs_file_mmap(struct file * , struct vm_area_struct *);
 extern const struct file_operations cifs_dir_ops;
 extern int cifs_dir_open(struct inode *inode, struct file *file);
 extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir);
-extern int cifs_dir_notify(struct file *, unsigned long arg);
 /* Functions related to dir entries */
 extern struct dentry_operations cifs_dentry_ops;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 06f6779988bf..382ba6298809 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -35,8 +35,8 @@ extern struct smb_hdr *cifs_buf_get(void);
 extern void cifs_buf_release(void *);
 extern struct smb_hdr *cifs_small_buf_get(void);
 extern void cifs_small_buf_release(void *);
-extern int smb_send(struct socket *, struct smb_hdr *,
+extern int smb_send(struct TCP_Server_Info *, struct smb_hdr *,
-                        unsigned int /* length */ , struct sockaddr *, bool);
+                        unsigned int /* length */);
 extern unsigned int _GetXid(void);
 extern void _FreeXid(unsigned int);
 #define GetXid() (int)_GetXid(); cFYI(1,("CIFS VFS: in %s as Xid: %d with uid: %d",__func__, xid,current_fsuid()));
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index e9ea394ee075..2209be943051 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1354,7 +1354,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 }
 static struct TCP_Server_Info *
-cifs_find_tcp_session(struct sockaddr *addr)
+cifs_find_tcp_session(struct sockaddr_storage *addr)
 {
        struct list_head *tmp;
        struct TCP_Server_Info *server;
@@ -1374,11 +1374,11 @@ cifs_find_tcp_session(struct sockaddr *addr)
                if (server->tcpStatus == CifsNew)
                        continue;
-                if (addr->sa_family == AF_INET &&
+                if (addr->ss_family == AF_INET &&
                    (addr4->sin_addr.s_addr !=
                     server->addr.sockAddr.sin_addr.s_addr))
                        continue;
-                else if (addr->sa_family == AF_INET6 &&
+                else if (addr->ss_family == AF_INET6 &&
                         memcmp(&server->addr.sockAddr6.sin6_addr,
                                &addr6->sin6_addr, sizeof(addr6->sin6_addr)))
                        continue;
@@ -1419,12 +1419,12 @@ static struct TCP_Server_Info *
 cifs_get_tcp_session(struct smb_vol *volume_info)
 {
        struct TCP_Server_Info *tcp_ses = NULL;
-        struct sockaddr addr;
+        struct sockaddr_storage addr;
        struct sockaddr_in *sin_server = (struct sockaddr_in *) &addr;
        struct sockaddr_in6 *sin_server6 = (struct sockaddr_in6 *) &addr;
        int rc;
-        memset(&addr, 0, sizeof(struct sockaddr));
+        memset(&addr, 0, sizeof(struct sockaddr_storage));
        if (volume_info->UNCip && volume_info->UNC) {
                rc = cifs_inet_pton(AF_INET, volume_info->UNCip,
@@ -1435,9 +1435,9 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
                        rc = cifs_inet_pton(AF_INET6, volume_info->UNCip,
                                            &sin_server6->sin6_addr.in6_u);
                        if (rc > 0)
-                                addr.sa_family = AF_INET6;
+                                addr.ss_family = AF_INET6;
                } else {
-                        addr.sa_family = AF_INET;
+                        addr.ss_family = AF_INET;
                }
                if (rc <= 0) {
@@ -1502,7 +1502,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
        tcp_ses->tcpStatus = CifsNew;
        ++tcp_ses->srv_count;
-        if (addr.sa_family == AF_INET6) {
+        if (addr.ss_family == AF_INET6) {
                cFYI(1, ("attempting ipv6 connect"));
                /* BB should we allow ipv6 on port 139? */
                /* other OS never observed in Wild doing 139 with v6 */
@@ -1802,7 +1802,7 @@ ipv4_connect(struct TCP_Server_Info *server)
         *  user space buffer
         */
        socket->sk->sk_rcvtimeo = 7 * HZ;
-        socket->sk->sk_sndtimeo = 3 * HZ;
+        socket->sk->sk_sndtimeo = 5 * HZ;
        /* make the bufsizes depend on wsize/rsize and max requests */
        if (server->noautotune) {
@@ -1860,9 +1860,7 @@ ipv4_connect(struct TCP_Server_Info *server)
                        smb_buf = (struct smb_hdr *)ses_init_buf;
                        /* sizeof RFC1002_SESSION_REQUEST with no scope */
                        smb_buf->smb_buf_length = 0x81000044;
-                        rc = smb_send(socket, smb_buf, 0x44,
+                        rc = smb_send(server, smb_buf, 0x44);
-                                (struct sockaddr *) &server->addr.sockAddr,
-                                server->noblocksnd);
                        kfree(ses_init_buf);
                        msleep(1); /* RFC1001 layer in at least one server
                                      requires very short break before negprot
@@ -1955,7 +1953,7 @@ ipv6_connect(struct TCP_Server_Info *server)
         * user space buffer
         */
        socket->sk->sk_rcvtimeo = 7 * HZ;
-        socket->sk->sk_sndtimeo = 3 * HZ;
+        socket->sk->sk_sndtimeo = 5 * HZ;
        server->ssocket = socket;
        return rc;
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 838d9c720a5c..964aad03c5ad 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -129,6 +129,17 @@ cifs_bp_rename_retry:
        return full_path;
 }
+/*
+ * Select the dentry operations table for direntry (cifs_ci_dentry_ops when
+ * tcon->nocase is set, cifs_dentry_ops otherwise) and then attach newinode
+ * to the dentry via d_instantiate().
+ */
+static void setup_cifs_dentry(struct cifsTconInfo *tcon,
+                              struct dentry *direntry,
+                              struct inode *newinode)
+{
+        direntry->d_op = tcon->nocase ? &cifs_ci_dentry_ops
+                                      : &cifs_dentry_ops;
+        d_instantiate(direntry, newinode);
+}
 /* Inode operations in similar order to how they appear in Linux file fs.h */
 int
@@ -139,14 +150,14 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
        int xid;
        int create_options = CREATE_NOT_DIR;
        int oplock = 0;
+        /* BB below access is too much for the mknod to request */
        int desiredAccess = GENERIC_READ | GENERIC_WRITE;
        __u16 fileHandle;
        struct cifs_sb_info *cifs_sb;
-        struct cifsTconInfo *pTcon;
+        struct cifsTconInfo *tcon;
        char *full_path = NULL;
        FILE_ALL_INFO *buf = NULL;
        struct inode *newinode = NULL;
-        struct cifsFileInfo *pCifsFile = NULL;
        struct cifsInodeInfo *pCifsInode;
        int disposition = FILE_OVERWRITE_IF;
        bool write_only = false;
@@ -154,7 +165,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
        xid = GetXid();
        cifs_sb = CIFS_SB(inode->i_sb);
-        pTcon = cifs_sb->tcon;
+        tcon = cifs_sb->tcon;
        full_path = build_path_from_dentry(direntry);
        if (full_path == NULL) {
@@ -162,6 +173,8 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
                return -ENOMEM;
        }
+        mode &= ~current->fs->umask;
        if (nd && (nd->flags & LOOKUP_OPEN)) {
                int oflags = nd->intent.open.flags;
@@ -196,17 +209,15 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
                return -ENOMEM;
        }
-        mode &= ~current->fs->umask;
        /*
         * if we're not using unix extensions, see if we need to set
         * ATTR_READONLY on the create call
         */
-        if (!pTcon->unix_ext && (mode & S_IWUGO) == 0)
+        if (!tcon->unix_ext && (mode & S_IWUGO) == 0)
                create_options |= CREATE_OPTION_READONLY;
        if (cifs_sb->tcon->ses->capabilities & CAP_NT_SMBS)
-                rc = CIFSSMBOpen(xid, pTcon, full_path, disposition,
+                rc = CIFSSMBOpen(xid, tcon, full_path, disposition,
                         desiredAccess, create_options,
                         &fileHandle, &oplock, buf, cifs_sb->local_nls,
                         cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -215,7 +226,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
        if (rc == -EIO) {
                /* old server, retry the open legacy style */
-                rc = SMBLegacyOpen(xid, pTcon, full_path, disposition,
+                rc = SMBLegacyOpen(xid, tcon, full_path, disposition,
                        desiredAccess, create_options,
                        &fileHandle, &oplock, buf, cifs_sb->local_nls,
                        cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -225,7 +236,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
        } else {
                /* If Open reported that we actually created a file
                then we now have to set the mode if possible */
-                if ((pTcon->unix_ext) && (oplock & CIFS_CREATE_ACTION)) {
+                if ((tcon->unix_ext) && (oplock & CIFS_CREATE_ACTION)) {
                        struct cifs_unix_set_info_args args = {
                                .mode   = mode,
                                .ctime  = NO_CHANGE_64,
@@ -244,20 +255,20 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
                                args.uid = NO_CHANGE_64;
                                args.gid = NO_CHANGE_64;
                        }
-                        CIFSSMBUnixSetInfo(xid, pTcon, full_path, &args,
+                        CIFSSMBUnixSetInfo(xid, tcon, full_path, &args,
                                cifs_sb->local_nls,
                                cifs_sb->mnt_cifs_flags &
                                        CIFS_MOUNT_MAP_SPECIAL_CHR);
                } else {
                        /* BB implement mode setting via Windows security
                           descriptors e.g. */
-                        /* CIFSSMBWinSetPerms(xid,pTcon,path,mode,-1,-1,nls);*/
+                        /* CIFSSMBWinSetPerms(xid,tcon,path,mode,-1,-1,nls);*/
                        /* Could set r/o dos attribute if mode & 0222 == 0 */
                }
                /* server might mask mode so we have to query for it */
-                if (pTcon->unix_ext)
+                if (tcon->unix_ext)
                        rc = cifs_get_inode_info_unix(&newinode, full_path,
                                                 inode->i_sb, xid);
                else {
@@ -283,22 +294,17 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
                }
                if (rc != 0) {
-                        cFYI(1,
+                        cFYI(1, ("Create worked, get_inode_info failed rc = %d",
-                             ("Create worked but get_inode_info failed rc = %d",
+                                 rc));
-                              rc));
+                } else
-                } else {
+                        setup_cifs_dentry(tcon, direntry, newinode);
-                        if (pTcon->nocase)
-                                direntry->d_op = &cifs_ci_dentry_ops;
-                        else
-                                direntry->d_op = &cifs_dentry_ops;
-                        d_instantiate(direntry, newinode);
-                }
                if ((nd == NULL /* nfsd case - nfs srv does not set nd */) ||
                        (!(nd->flags & LOOKUP_OPEN))) {
                        /* mknod case - do not leave file open */
-                        CIFSSMBClose(xid, pTcon, fileHandle);
+                        CIFSSMBClose(xid, tcon, fileHandle);
                } else if (newinode) {
-                        pCifsFile =
+                        struct cifsFileInfo *pCifsFile =
                           kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
                        if (pCifsFile == NULL)
@@ -316,7 +322,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
                        /* set the following in open now
                                pCifsFile->pfile = file; */
                        write_lock(&GlobalSMBSeslock);
-                        list_add(&pCifsFile->tlist, &pTcon->openFileList);
+                        list_add(&pCifsFile->tlist, &tcon->openFileList);
                        pCifsInode = CIFS_I(newinode);
                        if (pCifsInode) {
                                /* if readable file instance put first in list*/
diff --git a/fs/cifs/fcntl.c b/fs/cifs/fcntl.c
deleted file mode 100644
index 5a57581eb4b2..000000000000
--- a/fs/cifs/fcntl.c
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- *   fs/cifs/fcntl.c
- *
- *   vfs operations that deal with the file control API
- *
- *   Copyright (C) International Business Machines  Corp., 2003,2004
- *   Author(s): Steve French (sfrench@us.ibm.com)
- *
- *   This library is free software; you can redistribute it and/or modify
- *   it under the terms of the GNU Lesser General Public License as published
- *   by the Free Software Foundation; either version 2.1 of the License, or
- *   (at your option) any later version.
- *
- *   This library is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU Lesser General Public License for more details.
- *
- *   You should have received a copy of the GNU Lesser General Public License
- *   along with this library; if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-#include <linux/fs.h>
-#include <linux/stat.h>
-#include <linux/fcntl.h>
-#include "cifsglob.h"
-#include "cifsproto.h"
-#include "cifs_unicode.h"
-#include "cifs_debug.h"
-#include "cifsfs.h"
-static __u32 convert_to_cifs_notify_flags(unsigned long fcntl_notify_flags)
-{
-        __u32 cifs_ntfy_flags = 0;
-        /* No way on Linux VFS to ask to monitor xattr
-        changes (and no stream support either */
-        if (fcntl_notify_flags & DN_ACCESS)
-                cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_LAST_ACCESS;
-        if (fcntl_notify_flags & DN_MODIFY) {
-                /* What does this mean on directories? */
-                cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_LAST_WRITE |
-                        FILE_NOTIFY_CHANGE_SIZE;
-        }
-        if (fcntl_notify_flags & DN_CREATE) {
-                cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_CREATION |
-                        FILE_NOTIFY_CHANGE_LAST_WRITE;
-        }
-        if (fcntl_notify_flags & DN_DELETE)
-                cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_LAST_WRITE;
-        if (fcntl_notify_flags & DN_RENAME) {
-                /* BB review this - checking various server behaviors */
-                cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_DIR_NAME |
-                        FILE_NOTIFY_CHANGE_FILE_NAME;
-        }
-        if (fcntl_notify_flags & DN_ATTRIB) {
-                cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_SECURITY |
-                        FILE_NOTIFY_CHANGE_ATTRIBUTES;
-        }
-/*      if (fcntl_notify_flags & DN_MULTISHOT) {
-                cifs_ntfy_flags |= ;
-        } */ /* BB fixme - not sure how to handle this with CIFS yet */
-        return cifs_ntfy_flags;
-}
-int cifs_dir_notify(struct file *file, unsigned long arg)
-{
-        int xid;
-        int rc = -EINVAL;
-        int oplock = 0;
-        struct cifs_sb_info *cifs_sb;
-        struct cifsTconInfo *pTcon;
-        char *full_path = NULL;
-        __u32 filter = FILE_NOTIFY_CHANGE_NAME | FILE_NOTIFY_CHANGE_ATTRIBUTES;
-        __u16 netfid;
-        if (experimEnabled == 0)
-                return 0;
-        xid = GetXid();
-        cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-        pTcon = cifs_sb->tcon;
-        full_path = build_path_from_dentry(file->f_path.dentry);
-        if (full_path == NULL) {
-                rc = -ENOMEM;
-        } else {
-                cFYI(1, ("dir notify on file %s Arg 0x%lx", full_path, arg));
-                rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN,
-                        GENERIC_READ | SYNCHRONIZE, 0 /* create options */,
-                        &netfid, &oplock, NULL, cifs_sb->local_nls,
-                        cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
-                /* BB fixme - add this handle to a notify handle list */
-                if (rc) {
-                        cFYI(1, ("Could not open directory for notify"));
-                } else {
-                        filter = convert_to_cifs_notify_flags(arg);
-                        if (filter != 0) {
-                                rc = CIFSSMBNotify(xid, pTcon,
-                                        0 /* no subdirs */, netfid,
-                                        filter, file, arg & DN_MULTISHOT,
-                                        cifs_sb->local_nls);
-                        } else {
-                                rc = -EINVAL;
-                        }
-                        /* BB add code to close file eventually (at unmount
-                        it would close automatically but may be a way
-                        to do it easily when inode freed or when
-                        notify info is cleared/changed */
-                        cFYI(1, ("notify rc %d", rc));
-                }
-        }
-        FreeXid(xid);
-        return rc;
-}
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index b1e1fc6a6e6a..12bb656fbe75 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2074,7 +2074,7 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping,
        cFYI(1, ("write_begin from %lld len %d", (long long)pos, len));
-        page = __grab_cache_page(mapping, index);
+        page = grab_cache_page_write_begin(mapping, index, flags);
        if (!page) {
                rc = -ENOMEM;
                goto out;
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index f247da9f4edc..bcf7b5184664 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1285,6 +1285,11 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
        cifsInode = CIFS_I(direntry->d_inode);
        cifsInode->time = 0;    /* force revalidate to go get info when
                                   needed */
+        cifsInode = CIFS_I(inode);
+        cifsInode->time = 0;    /* force revalidate to get parent dir info
+                                   since cached search results now invalid */
        direntry->d_inode->i_ctime = inode->i_ctime = inode->i_mtime =
                current_fs_time(inode->i_sb);
@@ -1641,7 +1646,7 @@ do_expand:
        i_size_write(inode, offset);
        spin_unlock(&inode->i_lock);
 out_truncate:
-        if (inode->i_op && inode->i_op->truncate)
+        if (inode->i_op->truncate)
                inode->i_op->truncate(inode);
        return 0;
 out_sig:
diff --git a/fs/cifs/md5.c b/fs/cifs/md5.c
index 462bbfefd4b6..98b66a54c319 100644
--- a/fs/cifs/md5.c
+++ b/fs/cifs/md5.c
@@ -10,8 +10,8 @@
 * with every copy.
 *
 * To compute the message digest of a chunk of bytes, declare an
- * MD5Context structure, pass it to MD5Init, call MD5Update as
+ * MD5Context structure, pass it to cifs_MD5_init, call cifs_MD5_update as
- * needed on buffers full of bytes, and then call MD5Final, which
+ * needed on buffers full of bytes, and then call cifs_MD5_final, which
 * will fill a supplied 16-byte array with the digest.
 */
@@ -45,7 +45,7 @@ byteReverse(unsigned char *buf, unsigned longs)
 * initialization constants.
 */
 void
-MD5Init(struct MD5Context *ctx)
+cifs_MD5_init(struct MD5Context *ctx)
 {
        ctx->buf[0] = 0x67452301;
        ctx->buf[1] = 0xefcdab89;
@@ -61,7 +61,7 @@ MD5Init(struct MD5Context *ctx)
 * of bytes.
 */
 void
-MD5Update(struct MD5Context *ctx, unsigned char const *buf, unsigned len)
+cifs_MD5_update(struct MD5Context *ctx, unsigned char const *buf, unsigned len)
 {
        register __u32 t;
@@ -110,7 +110,7 @@ MD5Update(struct MD5Context *ctx, unsigned char const *buf, unsigned len)
 * 1 0* (64-bit count of bits processed, MSB-first)
 */
 void
-MD5Final(unsigned char digest[16], struct MD5Context *ctx)
+cifs_MD5_final(unsigned char digest[16], struct MD5Context *ctx)
 {
        unsigned int count;
        unsigned char *p;
@@ -165,7 +165,7 @@ MD5Final(unsigned char digest[16], struct MD5Context *ctx)
 /*
 * The core of the MD5 algorithm, this alters an existing MD5 hash to
- * reflect the addition of 16 longwords of new data.  MD5Update blocks
+ * reflect the addition of 16 longwords of new data.  cifs_MD5_update blocks
 * the data and converts bytes into longwords for this routine.
 */
 static void
@@ -267,9 +267,9 @@ hmac_md5_init_rfc2104(unsigned char *key, int key_len,
                unsigned char tk[16];
                struct MD5Context tctx;
-                MD5Init(&tctx);
+                cifs_MD5_init(&tctx);
-                MD5Update(&tctx, key, key_len);
+                cifs_MD5_update(&tctx, key, key_len);
-                MD5Final(tk, &tctx);
+                cifs_MD5_final(tk, &tctx);
                key = tk;
                key_len = 16;
@@ -287,8 +287,8 @@ hmac_md5_init_rfc2104(unsigned char *key, int key_len,
                ctx->k_opad[i] ^= 0x5c;
        }
-        MD5Init(&ctx->ctx);
+        cifs_MD5_init(&ctx->ctx);
-        MD5Update(&ctx->ctx, ctx->k_ipad, 64);
+        cifs_MD5_update(&ctx->ctx, ctx->k_ipad, 64);
 }
 #endif
@@ -317,8 +317,8 @@ hmac_md5_init_limK_to_64(const unsigned char *key, int key_len,
                ctx->k_opad[i] ^= 0x5c;
        }
-        MD5Init(&ctx->ctx);
+        cifs_MD5_init(&ctx->ctx);
-        MD5Update(&ctx->ctx, ctx->k_ipad, 64);
+        cifs_MD5_update(&ctx->ctx, ctx->k_ipad, 64);
 }
 /***********************************************************************
@@ -328,7 +328,7 @@ void
 hmac_md5_update(const unsigned char *text, int text_len,
                struct HMACMD5Context *ctx)
 {
-        MD5Update(&ctx->ctx, text, text_len);   /* then text of datagram */
+        cifs_MD5_update(&ctx->ctx, text, text_len);     /* then text of datagram */
 }
 /***********************************************************************
@@ -339,12 +339,12 @@ hmac_md5_final(unsigned char *digest, struct HMACMD5Context *ctx)
 {
        struct MD5Context ctx_o;
-        MD5Final(digest, &ctx->ctx);
+        cifs_MD5_final(digest, &ctx->ctx);
-        MD5Init(&ctx_o);
+        cifs_MD5_init(&ctx_o);
-        MD5Update(&ctx_o, ctx->k_opad, 64);
+        cifs_MD5_update(&ctx_o, ctx->k_opad, 64);
-        MD5Update(&ctx_o, digest, 16);
+        cifs_MD5_update(&ctx_o, digest, 16);
-        MD5Final(digest, &ctx_o);
+        cifs_MD5_final(digest, &ctx_o);
 }
 /***********************************************************
diff --git a/fs/cifs/md5.h b/fs/cifs/md5.h
index f7d4f4197bac..6fba8cb402fd 100644
--- a/fs/cifs/md5.h
+++ b/fs/cifs/md5.h
@@ -20,10 +20,10 @@ struct HMACMD5Context {
 };
 #endif                          /* _HMAC_MD5_H */
-void MD5Init(struct MD5Context *context);
+void cifs_MD5_init(struct MD5Context *context);
-void MD5Update(struct MD5Context *context, unsigned char const *buf,
+void cifs_MD5_update(struct MD5Context *context, unsigned char const *buf,
                        unsigned len);
-void MD5Final(unsigned char digest[16], struct MD5Context *context);
+void cifs_MD5_final(unsigned char digest[16], struct MD5Context *context);
 /* The following definitions come from lib/hmacmd5.c  */
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 7ebe6599ed3a..0ad3e2d116a6 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -154,81 +154,8 @@ void DeleteTconOplockQEntries(struct cifsTconInfo *tcon)
        spin_unlock(&GlobalMid_Lock);
 }
-int
-smb_send(struct socket *ssocket, struct smb_hdr *smb_buffer,
-         unsigned int smb_buf_length, struct sockaddr *sin, bool noblocksnd)
-{
-        int rc = 0;
-        int i = 0;
-        struct msghdr smb_msg;
-        struct kvec iov;
-        unsigned len = smb_buf_length + 4;
-        if (ssocket == NULL)
-                return -ENOTSOCK; /* BB eventually add reconnect code here */
-        iov.iov_base = smb_buffer;
-        iov.iov_len = len;
-        smb_msg.msg_name = sin;
-        smb_msg.msg_namelen = sizeof(struct sockaddr);
-        smb_msg.msg_control = NULL;
-        smb_msg.msg_controllen = 0;
-        if (noblocksnd)
-                smb_msg.msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL;
-        else
-                smb_msg.msg_flags = MSG_NOSIGNAL;
-        /* smb header is converted in header_assemble. bcc and rest of SMB word
-           area, and byte area if necessary, is converted to littleendian in
-           cifssmb.c and RFC1001 len is converted to bigendian in smb_send
-           Flags2 is converted in SendReceive */
-        smb_buffer->smb_buf_length = cpu_to_be32(smb_buffer->smb_buf_length);
-        cFYI(1, ("Sending smb of length %d", smb_buf_length));
-        dump_smb(smb_buffer, len);
-        while (len > 0) {
-                rc = kernel_sendmsg(ssocket, &smb_msg, &iov, 1, len);
-                if ((rc == -ENOSPC) || (rc == -EAGAIN)) {
-                        i++;
-                /* smaller timeout here than send2 since smaller size */
-                /* Although it may not be required, this also is smaller
-                   oplock break time */
-                        if (i > 12) {
-                                cERROR(1,
-                                   ("sends on sock %p stuck for 7 seconds",
-                                    ssocket));
-                                rc = -EAGAIN;
-                                break;
-                        }
-                        msleep(1 << i);
-                        continue;
-                }
-                if (rc < 0)
-                        break;
-                else
-                        i = 0; /* reset i after each successful send */
-                iov.iov_base += rc;
-                iov.iov_len -= rc;
-                len -= rc;
-        }
-        if (rc < 0) {
-                cERROR(1, ("Error %d sending data on socket to server", rc));
-        } else {
-                rc = 0;
-        }
-        /* Don't want to modify the buffer as a
-           side effect of this call. */
-        smb_buffer->smb_buf_length = smb_buf_length;
-        return rc;
-}
 static int
-smb_send2(struct TCP_Server_Info *server, struct kvec *iov, int n_vec,
+smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
-          struct sockaddr *sin, bool noblocksnd)
 {
        int rc = 0;
        int i = 0;
@@ -243,11 +170,11 @@ smb_send2(struct TCP_Server_Info *server, struct kvec *iov, int n_vec,
        if (ssocket == NULL)
                return -ENOTSOCK; /* BB eventually add reconnect code here */
-        smb_msg.msg_name = sin;
+        smb_msg.msg_name = (struct sockaddr *) &server->addr.sockAddr;
        smb_msg.msg_namelen = sizeof(struct sockaddr);
        smb_msg.msg_control = NULL;
        smb_msg.msg_controllen = 0;
-        if (noblocksnd)
+        if (server->noblocksnd)
                smb_msg.msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL;
        else
                smb_msg.msg_flags = MSG_NOSIGNAL;
@@ -272,7 +199,25 @@ smb_send2(struct TCP_Server_Info *server, struct kvec *iov, int n_vec,
                                    n_vec - first_vec, total_len);
                if ((rc == -ENOSPC) || (rc == -EAGAIN)) {
                        i++;
-                        if (i >= 14) {
+                        /* For a blocking send we try 3 times, since each try
+                           can block for 5 seconds. For a nonblocking send we
+                           have to try more times, but wait increasing amounts
+                           of time to allow the socket to clear.  The overall
+                           time we wait in either case to send on the socket
+                           is about 15 seconds.
+                           Similarly, in SendReceive[2] we wait 15 seconds
+                           for the server to send a response back for
+                           most types of requests (except SMB Write
+                           past end of file which can be slow, and
+                           blocking lock operations). NFS waits slightly longer
+                           than CIFS, but this can make it take longer for
+                           nonresponsive servers to be detected and 15 seconds
+                           is more than enough time for modern networks to
+                           send a packet.  In most cases if we fail to send
+                           after the retries we will kill the socket and
+                           reconnect which may clear the network problem.
+                        */
+                        if ((i >= 14) || (!server->noblocksnd && (i > 2))) {
                                cERROR(1,
                                   ("sends on sock %p stuck for 15 seconds",
                                    ssocket));
@@ -339,6 +284,18 @@ smb_send2(struct TCP_Server_Info *server, struct kvec *iov, int n_vec,
        return rc;
 }
+int
+smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer,
+         unsigned int smb_buf_length)
+{
+        struct kvec iov;
+        iov.iov_base = smb_buffer;
+        iov.iov_len = smb_buf_length + 4;
+        return smb_sendv(server, &iov, 1);
+}
 static int wait_for_free_request(struct cifsSesInfo *ses, const int long_op)
 {
        if (long_op == CIFS_ASYNC_OP) {
@@ -540,9 +497,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 #ifdef CONFIG_CIFS_STATS2
        atomic_inc(&ses->server->inSend);
 #endif
-        rc = smb_send2(ses->server, iov, n_vec,
+        rc = smb_sendv(ses->server, iov, n_vec);
-                      (struct sockaddr *) &(ses->server->addr.sockAddr),
-                       ses->server->noblocksnd);
 #ifdef CONFIG_CIFS_STATS2
        atomic_dec(&ses->server->inSend);
        midQ->when_sent = jiffies;
@@ -736,9 +691,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 #ifdef CONFIG_CIFS_STATS2
        atomic_inc(&ses->server->inSend);
 #endif
-        rc = smb_send(ses->server->ssocket, in_buf, in_buf->smb_buf_length,
+        rc = smb_send(ses->server, in_buf, in_buf->smb_buf_length);
-                      (struct sockaddr *) &(ses->server->addr.sockAddr),
-                      ses->server->noblocksnd);
 #ifdef CONFIG_CIFS_STATS2
        atomic_dec(&ses->server->inSend);
        midQ->when_sent = jiffies;
@@ -879,9 +832,7 @@ send_nt_cancel(struct cifsTconInfo *tcon, struct smb_hdr *in_buf,
                mutex_unlock(&ses->server->srv_mutex);
                return rc;
        }
-        rc = smb_send(ses->server->ssocket, in_buf, in_buf->smb_buf_length,
+        rc = smb_send(ses->server, in_buf, in_buf->smb_buf_length);
-              (struct sockaddr *) &(ses->server->addr.sockAddr),
-              ses->server->noblocksnd);
        mutex_unlock(&ses->server->srv_mutex);
        return rc;
 }
@@ -973,9 +924,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 #ifdef CONFIG_CIFS_STATS2
        atomic_inc(&ses->server->inSend);
 #endif
-        rc = smb_send(ses->server->ssocket, in_buf, in_buf->smb_buf_length,
+        rc = smb_send(ses->server, in_buf, in_buf->smb_buf_length);
-                      (struct sockaddr *) &(ses->server->addr.sockAddr),
-                      ses->server->noblocksnd);
 #ifdef CONFIG_CIFS_STATS2
        atomic_dec(&ses->server->inSend);
        midQ->when_sent = jiffies;
diff --git a/fs/coda/Kconfig b/fs/coda/Kconfig
new file mode 100644
index 000000000000..c0e5a7fad06d
--- /dev/null
+++ b/fs/coda/Kconfig
@@ -0,0 +1,21 @@
+config CODA_FS
+        tristate "Coda file system support (advanced network fs)"
+        depends on INET
+        help
+          Coda is an advanced network file system, similar to NFS in that it
+          enables you to mount file systems of a remote server and access them
+          with regular Unix commands as if they were sitting on your hard
+          disk.  Coda has several advantages over NFS: support for
+          disconnected operation (e.g. for laptops), read/write server
+          replication, security model for authentication and encryption,
+          persistent client caches and write back caching.
+          If you say Y here, your Linux box will be able to act as a Coda
+          *client*.  You will need user level code as well, both for the
+          client and server.  Servers are currently user level, i.e. they need
+          no kernel support.  Please read
+          <file:Documentation/filesystems/coda.txt> and check out the Coda
+          home page <http://www.coda.cs.cmu.edu/>.
+          To compile the coda client support as a module, choose M here: the
+          module will be called coda.
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 466303db2df6..6a347fbc998a 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -201,8 +201,7 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
 int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync)
 {
        struct file *host_file;
-        struct dentry *host_dentry;
+        struct inode *coda_inode = coda_dentry->d_inode;
-        struct inode *host_inode, *coda_inode = coda_dentry->d_inode;
        struct coda_file_info *cfi;
        int err = 0;
@@ -214,14 +213,7 @@ int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync)
        BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
        host_file = cfi->cfi_container;
-        if (host_file->f_op && host_file->f_op->fsync) {
+        err = vfs_fsync(host_file, host_file->f_path.dentry, datasync);
-                host_dentry = host_file->f_path.dentry;
-                host_inode = host_dentry->d_inode;
-                mutex_lock(&host_inode->i_mutex);
-                err = host_file->f_op->fsync(host_file, host_dentry, datasync);
-                mutex_unlock(&host_inode->i_mutex);
-        }
        if ( !err && !datasync ) {
                lock_kernel();
                err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode));
diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c
index 81b7771c6465..43c96ce29614 100644
--- a/fs/coda/sysctl.c
+++ b/fs/coda/sysctl.c
@@ -11,7 +11,9 @@
 #include "coda_int.h"
+#ifdef CONFIG_SYSCTL
 static struct ctl_table_header *fs_table_header;
+#endif
 static ctl_table coda_table[] = {
        {
@@ -41,6 +43,7 @@ static ctl_table coda_table[] = {
        {}
 };
+#ifdef CONFIG_SYSCTL
 static ctl_table fs_table[] = {
        {
                .ctl_name       = CTL_UNNUMBERED,
@@ -50,7 +53,7 @@ static ctl_table fs_table[] = {
        },
        {}
 };
+#endif
 void coda_sysctl_init(void)
 {
diff --git a/fs/compat.c b/fs/compat.c
index d1ece79b6411..65a070e705ab 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1187,6 +1187,9 @@ compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, unsign
        ret = compat_do_readv_writev(READ, file, vec, vlen, &file->f_pos);
 out:
+        if (ret > 0)
+                add_rchar(current, ret);
+        inc_syscr(current);
        fput(file);
        return ret;
 }
@@ -1210,6 +1213,9 @@ compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, unsig
        ret = compat_do_readv_writev(WRITE, file, vec, vlen, &file->f_pos);
 out:
+        if (ret > 0)
+                add_wchar(current, ret);
+        inc_syscw(current);
        fput(file);
        return ret;
 }
@@ -1703,7 +1709,7 @@ asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
 }
 #ifdef HAVE_SET_RESTORE_SIGMASK
-asmlinkage long compat_sys_pselect7(int n, compat_ulong_t __user *inp,
+static long do_compat_pselect(int n, compat_ulong_t __user *inp,
        compat_ulong_t __user *outp, compat_ulong_t __user *exp,
        struct compat_timespec __user *tsp, compat_sigset_t __user *sigmask,
        compat_size_t sigsetsize)
@@ -1769,8 +1775,8 @@ asmlinkage long compat_sys_pselect6(int n, compat_ulong_t __user *inp,
                                (compat_size_t __user *)(sig+sizeof(up))))
                        return -EFAULT;
        }
-        return compat_sys_pselect7(n, inp, outp, exp, tsp, compat_ptr(up),
+        return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(up),
-                                        sigsetsize);
+                                 sigsetsize);
 }
 asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds,
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 5235c67e7594..c8f8d5904f5e 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -538,6 +538,7 @@ static int dev_ifsioc(unsigned int fd, unsigned int cmd, unsigned long arg)
                 * cannot be fixed without breaking all existing apps.
                 */
                case TUNSETIFF:
+                case TUNGETIFF:
                case SIOCGIFFLAGS:
                case SIOCGIFMETRIC:
                case SIOCGIFMTU:
@@ -1982,6 +1983,11 @@ COMPATIBLE_IOCTL(TUNSETNOCSUM)
 COMPATIBLE_IOCTL(TUNSETDEBUG)
 COMPATIBLE_IOCTL(TUNSETPERSIST)
 COMPATIBLE_IOCTL(TUNSETOWNER)
+COMPATIBLE_IOCTL(TUNSETLINK)
+COMPATIBLE_IOCTL(TUNSETGROUP)
+COMPATIBLE_IOCTL(TUNGETFEATURES)
+COMPATIBLE_IOCTL(TUNSETOFFLOAD)
+COMPATIBLE_IOCTL(TUNSETTXFILTER)
 /* Big V */
 COMPATIBLE_IOCTL(VT_SETMODE)
 COMPATIBLE_IOCTL(VT_GETMODE)
@@ -2573,6 +2579,7 @@ HANDLE_IOCTL(SIOCGIFPFLAGS, dev_ifsioc)
 HANDLE_IOCTL(SIOCGIFTXQLEN, dev_ifsioc)
 HANDLE_IOCTL(SIOCSIFTXQLEN, dev_ifsioc)
 HANDLE_IOCTL(TUNSETIFF, dev_ifsioc)
+HANDLE_IOCTL(TUNGETIFF, dev_ifsioc)
 HANDLE_IOCTL(SIOCETHTOOL, ethtool_ioctl)
 HANDLE_IOCTL(SIOCBONDENSLAVE, bond_ioctl)
 HANDLE_IOCTL(SIOCBONDRELEASE, bond_ioctl)
diff --git a/fs/configfs/Kconfig b/fs/configfs/Kconfig
new file mode 100644
index 000000000000..13587cc97a0b
--- /dev/null
+++ b/fs/configfs/Kconfig
@@ -0,0 +1,11 @@
+config CONFIGFS_FS
+        tristate "Userspace-driven configuration filesystem"
+        depends on SYSFS
+        help
+          configfs is a ram-based filesystem that provides the converse
+          of sysfs's functionality. Where sysfs is a filesystem-based
+          view of kernel objects, configfs is a filesystem-based manager
+          of kernel objects, or config_items.
+          Both sysfs and configfs can and should exist together on the
+          same system. One is not a replacement for the other.
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 8e93341f3e82..9c2358391147 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -553,12 +553,24 @@ static void detach_groups(struct config_group *group)
                child = sd->s_dentry;
+                /*
+                 * Note: we hide this from lockdep since we have no way
+                 * to teach lockdep about recursive
+                 * I_MUTEX_PARENT -> I_MUTEX_CHILD patterns along a path
+                 * in an inode tree, which are valid as soon as
+                 * I_MUTEX_PARENT -> I_MUTEX_CHILD is valid from a
+                 * parent inode to one of its children.
+                 */
+                lockdep_off();
                mutex_lock(&child->d_inode->i_mutex);
+                lockdep_on();
                configfs_detach_group(sd->s_element);
                child->d_inode->i_flags |= S_DEAD;
+                lockdep_off();
                mutex_unlock(&child->d_inode->i_mutex);
+                lockdep_on();
                d_delete(child);
                dput(child);
@@ -748,11 +760,22 @@ static int configfs_attach_item(struct config_item *parent_item,
                         * We are going to remove an inode and its dentry but
                         * the VFS may already have hit and used them. Thus,
                         * we must lock them as rmdir() would.
+                         *
+                         * Note: we hide this from lockdep since we have no way
+                         * to teach lockdep about recursive
+                         * I_MUTEX_PARENT -> I_MUTEX_CHILD patterns along a path
+                         * in an inode tree, which are valid as soon as
+                         * I_MUTEX_PARENT -> I_MUTEX_CHILD is valid from a
+                         * parent inode to one of its children.
                         */
+                        lockdep_off();
                        mutex_lock(&dentry->d_inode->i_mutex);
+                        lockdep_on();
                        configfs_remove_dir(item);
                        dentry->d_inode->i_flags |= S_DEAD;
+                        lockdep_off();
                        mutex_unlock(&dentry->d_inode->i_mutex);
+                        lockdep_on();
                        d_delete(dentry);
                }
        }
@@ -787,14 +810,25 @@ static int configfs_attach_group(struct config_item *parent_item,
                 *
                 * We must also lock the inode to remove it safely in case of
                 * error, as rmdir() would.
+                 *
+                 * Note: we hide this from lockdep since we have no way
+                 * to teach lockdep about recursive
+                 * I_MUTEX_PARENT -> I_MUTEX_CHILD patterns along a path
+                 * in an inode tree, which are valid as soon as
+                 * I_MUTEX_PARENT -> I_MUTEX_CHILD is valid from a
+                 * parent inode to one of its children.
                 */
+                lockdep_off();
                mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
+                lockdep_on();
                ret = populate_groups(to_config_group(item));
                if (ret) {
                        configfs_detach_item(item);
                        dentry->d_inode->i_flags |= S_DEAD;
                }
+                lockdep_off();
                mutex_unlock(&dentry->d_inode->i_mutex);
+                lockdep_on();
                if (ret)
                        d_delete(dentry);
        }
@@ -956,7 +990,17 @@ static int configfs_depend_prep(struct dentry *origin,
        BUG_ON(!origin || !sd);
        /* Lock this guy on the way down */
+        /*
+         * Note: we hide this from lockdep since we have no way
+         * to teach lockdep about recursive
+         * I_MUTEX_PARENT -> I_MUTEX_CHILD patterns along a path
+         * in an inode tree, which are valid as soon as
+         * I_MUTEX_PARENT -> I_MUTEX_CHILD is valid from a
+         * parent inode to one of its children.
+         */
+        lockdep_off();
        mutex_lock(&sd->s_dentry->d_inode->i_mutex);
+        lockdep_on();
        if (sd->s_element == target)  /* Boo-yah */
                goto out;
@@ -970,7 +1014,9 @@ static int configfs_depend_prep(struct dentry *origin,
        }
        /* We looped all our children and didn't find target */
+        lockdep_off();
        mutex_unlock(&sd->s_dentry->d_inode->i_mutex);
+        lockdep_on();
        ret = -ENOENT;
 out:
@@ -990,11 +1036,16 @@ static void configfs_depend_rollback(struct dentry *origin,
        struct dentry *dentry = item->ci_dentry;
        while (dentry != origin) {
+                /* See comments in configfs_depend_prep() */
+                lockdep_off();
                mutex_unlock(&dentry->d_inode->i_mutex);
+                lockdep_on();
                dentry = dentry->d_parent;
        }
+        lockdep_off();
        mutex_unlock(&origin->d_inode->i_mutex);
+        lockdep_on();
 }
 int configfs_depend_item(struct configfs_subsystem *subsys,
@@ -1329,8 +1380,16 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
                        }
                        /* Wait until the racing operation terminates */
+                        /*
+                         * Note: we hide this from lockdep since we are locked
+                         * with subclass I_MUTEX_NORMAL from vfs_rmdir() (why
+                         * not I_MUTEX_CHILD?), and I_MUTEX_XATTR or
+                         * I_MUTEX_QUOTA are not relevant for the locked inode.
+                         */
+                        lockdep_off();
                        mutex_lock(wait_mutex);
                        mutex_unlock(wait_mutex);
+                        lockdep_on();
                }
        } while (ret == -EAGAIN);
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 4803ccc94480..5d349d38e056 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -117,8 +117,6 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
 static inline void set_default_inode_attr(struct inode * inode, mode_t mode)
 {
        inode->i_mode = mode;
-        inode->i_uid = 0;
-        inode->i_gid = 0;
        inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 }
@@ -136,7 +134,6 @@ struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent * sd)
 {
        struct inode * inode = new_inode(configfs_sb);
        if (inode) {
-                inode->i_blocks = 0;
                inode->i_mapping->a_ops = &configfs_aops;
                inode->i_mapping->backing_dev_info = &configfs_backing_dev_info;
                inode->i_op = &configfs_inode_operations;
diff --git a/fs/cramfs/Kconfig b/fs/cramfs/Kconfig
new file mode 100644
index 000000000000..cd06466f365e
--- /dev/null
+++ b/fs/cramfs/Kconfig
@@ -0,0 +1,19 @@
+config CRAMFS
+        tristate "Compressed ROM file system support (cramfs)"
+        depends on BLOCK
+        select ZLIB_INFLATE
+        help
+          Saying Y here includes support for CramFs (Compressed ROM File
+          System).  CramFs is designed to be a simple, small, and compressed
+          file system for ROM based embedded systems.  CramFs is read-only,
+          limited to 256MB file systems (with 16MB files), and doesn't support
+          16/32 bits uid/gid, hard links and timestamps.
+          See <file:Documentation/filesystems/cramfs.txt> and
+          <file:fs/cramfs/README> for further information.
+          To compile this as a module, choose M here: the module will be called
+          cramfs.  Note that the root file system (the one containing the
+          directory /) cannot be compiled as a module.
+          If unsure, say N.
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index f40423eb1a14..a07338d2d140 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -83,8 +83,6 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
                        inode->i_op = &page_symlink_inode_operations;
                        inode->i_data.a_ops = &cramfs_aops;
                } else {
-                        inode->i_size = 0;
-                        inode->i_blocks = 0;
                        init_special_inode(inode, inode->i_mode,
                                old_decode_dev(cramfs_inode->size));
                }
diff --git a/fs/dcache.c b/fs/dcache.c
index a1d86c7f3e66..937df0fb0da5 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -34,7 +34,6 @@
 #include <linux/bootmem.h>
 #include "internal.h"
 int sysctl_vfs_cache_pressure __read_mostly = 100;
 EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
@@ -948,9 +947,6 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
        dentry->d_op = NULL;
        dentry->d_fsdata = NULL;
        dentry->d_mounted = 0;
-#ifdef CONFIG_PROFILING
-        dentry->d_cookie = NULL;
-#endif
        INIT_HLIST_NODE(&dentry->d_hash);
        INIT_LIST_HEAD(&dentry->d_lru);
        INIT_LIST_HEAD(&dentry->d_subdirs);
@@ -1336,7 +1332,7 @@ err_out:
 *
 * Searches the children of the parent dentry for the name in question. If
 * the dentry is found its reference count is incremented and the dentry
- * is returned. The caller must use d_put to free the entry when it has
+ * is returned. The caller must use dput to free the entry when it has
 * finished using it. %NULL is returned on failure.
 *
 * __d_lookup is dcache_lock free. The hash list is protected using RCU.
@@ -1571,10 +1567,6 @@ void d_rehash(struct dentry * entry)
        spin_unlock(&dcache_lock);
 }
-#define do_switch(x,y) do { \
-        __typeof__ (x) __tmp = x; \
-        x = y; y = __tmp; } while (0)
 /*
 * When switching names, the actual string doesn't strictly have to
 * be preserved in the target - because we're dropping the target
@@ -1593,7 +1585,7 @@ static void switch_names(struct dentry *dentry, struct dentry *target)
                        /*
                         * Both external: swap the pointers
                         */
-                        do_switch(target->d_name.name, dentry->d_name.name);
+                        swap(target->d_name.name, dentry->d_name.name);
                } else {
                        /*
                         * dentry:internal, target:external.  Steal target's
@@ -1620,8 +1612,11 @@ static void switch_names(struct dentry *dentry, struct dentry *target)
                         */
                        memcpy(dentry->d_iname, target->d_name.name,
                                        target->d_name.len + 1);
+                        dentry->d_name.len = target->d_name.len;
+                        return;
                }
        }
+        swap(dentry->d_name.len, target->d_name.len);
 }
 /*
@@ -1681,8 +1676,7 @@ already_unhashed:
        /* Switch the names.. */
        switch_names(dentry, target);
-        do_switch(dentry->d_name.len, target->d_name.len);
+        swap(dentry->d_name.hash, target->d_name.hash);
-        do_switch(dentry->d_name.hash, target->d_name.hash);
        /* ... and switch the parents */
        if (IS_ROOT(dentry)) {
@@ -1690,7 +1684,7 @@ already_unhashed:
                target->d_parent = target;
                INIT_LIST_HEAD(&target->d_u.d_child);
        } else {
-                do_switch(dentry->d_parent, target->d_parent);
+                swap(dentry->d_parent, target->d_parent);
                /* And add them back to the (new) parent lists */
                list_add(&target->d_u.d_child, &target->d_parent->d_subdirs);
@@ -1791,8 +1785,7 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
        struct dentry *dparent, *aparent;
        switch_names(dentry, anon);
-        do_switch(dentry->d_name.len, anon->d_name.len);
+        swap(dentry->d_name.hash, anon->d_name.hash);
-        do_switch(dentry->d_name.hash, anon->d_name.hash);
        dparent = dentry->d_parent;
        aparent = anon->d_parent;
@@ -1911,7 +1904,8 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name)
 * Convert a dentry into an ASCII path name. If the entry has been deleted
 * the string " (deleted)" is appended. Note that this is ambiguous.
 *
- * Returns the buffer or an error code if the path was too long.
+ * Returns a pointer into the buffer or an error code if the
+ * path was too long.
 *
 * "buflen" should be positive. Caller holds the dcache_lock.
 *
@@ -1987,7 +1981,10 @@ Elong:
 * Convert a dentry into an ASCII path name. If the entry has been deleted
 * the string " (deleted)" is appended. Note that this is ambiguous.
 *
- * Returns the buffer or an error code if the path was too long.
+ * Returns a pointer into the buffer or an error code if the path was
+ * too long. Note: Callers should use the returned pointer, not the passed
+ * in buffer, to use the name! The implementation often starts at an offset
+ * into the buffer, and may leave 0 bytes at the start.
 *
 * "buflen" should be positive.
 */
@@ -2095,7 +2092,7 @@ Elong:
 *              return NULL;
 *      }
 */
-asmlinkage long sys_getcwd(char __user *buf, unsigned long size)
+SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
 {
        int error;
        struct path pwd, root;
@@ -2313,9 +2310,6 @@ static void __init dcache_init(void)
 /* SLAB cache for __getname() consumers */
 struct kmem_cache *names_cachep __read_mostly;
-/* SLAB cache for file structures */
-struct kmem_cache *filp_cachep __read_mostly;
 EXPORT_SYMBOL(d_genocide);
 void __init vfs_caches_init_early(void)
@@ -2337,9 +2331,6 @@ void __init vfs_caches_init(unsigned long mempages)
        names_cachep = kmem_cache_create("names_cache", PATH_MAX, 0,
                        SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
-        filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
-                        SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
        dcache_init();
        inode_init();
        files_init(mempages);
diff --git a/fs/dcookies.c b/fs/dcookies.c
index 855d4b1d619a..a21cabdbd87b 100644
--- a/fs/dcookies.c
+++ b/fs/dcookies.c
@@ -93,10 +93,15 @@ static struct dcookie_struct *alloc_dcookie(struct path *path)
 {
        struct dcookie_struct *dcs = kmem_cache_alloc(dcookie_cache,
                                                        GFP_KERNEL);
+        struct dentry *d;
        if (!dcs)
                return NULL;
-        path->dentry->d_cookie = dcs;
+        d = path->dentry;
+        spin_lock(&d->d_lock);
+        d->d_flags |= DCACHE_COOKIE;
+        spin_unlock(&d->d_lock);
        dcs->path = *path;
        path_get(path);
        hash_dcookie(dcs);
@@ -119,14 +124,14 @@ int get_dcookie(struct path *path, unsigned long *cookie)
                goto out;
        }
-        dcs = path->dentry->d_cookie;
+        if (path->dentry->d_flags & DCACHE_COOKIE) {
+                dcs = find_dcookie((unsigned long)path->dentry);
-        if (!dcs)
+        } else {
                dcs = alloc_dcookie(path);
+                if (!dcs) {
-        if (!dcs) {
+                        err = -ENOMEM;
-                err = -ENOMEM;
+                        goto out;
-                goto out;
+                }
        }
        *cookie = dcookie_value(dcs);
@@ -140,7 +145,7 @@ out:
 /* And here is where the userspace process can look up the cookie value
 * to retrieve the path.
 */
-asmlinkage long sys_lookup_dcookie(u64 cookie64, char __user * buf, size_t len)
+SYSCALL_DEFINE(lookup_dcookie)(u64 cookie64, char __user * buf, size_t len)
 {
        unsigned long cookie = (unsigned long)cookie64;
        int err = -EINVAL;
@@ -193,7 +198,13 @@ out:
        mutex_unlock(&dcookie_mutex);
        return err;
 }
+#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
+/*
+ * Wrapper entry point taking @buf and @len as plain longs; they are cast
+ * back to a user pointer and a size_t before calling SYSC_lookup_dcookie()
+ * (presumably the body emitted by SYSCALL_DEFINE(lookup_dcookie) - confirm).
+ */
+asmlinkage long SyS_lookup_dcookie(u64 cookie64, long buf, long len)
+{
+        char __user *ubuf = (char __user *) buf;
+        size_t ulen = (size_t) len;
+        return SYSC_lookup_dcookie(cookie64, ubuf, ulen);
+}
+SYSCALL_ALIAS(sys_lookup_dcookie, SyS_lookup_dcookie);
+#endif
 static int dcookie_init(void)
 {
@@ -251,7 +262,12 @@ out_kmem:
 static void free_dcookie(struct dcookie_struct * dcs)
 {
-        dcs->path.dentry->d_cookie = NULL;
+        struct dentry *d = dcs->path.dentry;
+        spin_lock(&d->d_lock);
+        d->d_flags &= ~DCACHE_COOKIE;
+        spin_unlock(&d->d_lock);
        path_put(&dcs->path);
        kmem_cache_free(dcookie_cache, dcs);
 }
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 159a5efd6a8a..33a90120f6ad 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -294,6 +294,38 @@ struct dentry *debugfs_create_x32(const char *name, mode_t mode,
 }
 EXPORT_SYMBOL_GPL(debugfs_create_x32);
+/* Store @val into the size_t variable that @data points at; always returns 0. */
+static int debugfs_size_t_set(void *data, u64 val)
+{
+        size_t *target = data;
+        *target = val;
+        return 0;
+}
+/* Copy the size_t variable that @data points at into *@val; always returns 0. */
+static int debugfs_size_t_get(void *data, u64 *val)
+{
+        size_t *source = data;
+        *val = *source;
+        return 0;
+}
+/*
+ * fops_size_t: attribute built from debugfs_size_t_get()/debugfs_size_t_set().
+ * Both helpers exchange the value as a u64, hence the "%llu\n" format.
+ */
+DEFINE_SIMPLE_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set,
+                        "%llu\n");      /* %llu and %zu are more or less the same */
+/**
+ * debugfs_create_size_t - create a debugfs file that is used to read and write a size_t value
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have
+ * @parent: a pointer to the parent dentry for this file.  This should be a
+ *          directory dentry if set.  If this parameter is %NULL, then the
+ *          file will be created in the root of the debugfs filesystem.
+ * @value: a pointer to the variable that the file should read from and
+ *         write to.
+ *
+ * The file is backed by fops_size_t; the return value is whatever
+ * debugfs_create_file() returns.
+ */
+struct dentry *debugfs_create_size_t(const char *name, mode_t mode,
+                                     struct dentry *parent, size_t *value)
+{
+        struct dentry *dentry;
+        dentry = debugfs_create_file(name, mode, parent, value, &fops_size_t);
+        return dentry;
+}
+EXPORT_SYMBOL_GPL(debugfs_create_size_t);
 static ssize_t read_file_bool(struct file *file, char __user *user_buf,
                              size_t count, loff_t *ppos)
 {
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 3dbe2169cf36..81ae9ea3c6e1 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -37,9 +37,6 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d
        if (inode) {
                inode->i_mode = mode;
-                inode->i_uid = 0;
-                inode->i_gid = 0;
-                inode->i_blocks = 0;
                inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
                switch (mode & S_IFMT) {
                default:
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 5d61b7c06e13..5f3231b9633f 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -27,25 +27,32 @@
 #define DEVPTS_SUPER_MAGIC 0x1cd1
 #define DEVPTS_DEFAULT_MODE 0600
+/*
+ * ptmx is a new node in /dev/pts and will be unused in legacy (single-
+ * instance) mode. To prevent surprises in user space, set permissions of
+ * ptmx to 0. Use 'chmod' or remount with '-o ptmxmode' to set meaningful
+ * permissions.
+ */
+#define DEVPTS_DEFAULT_PTMX_MODE 0000
 #define PTMX_MINOR      2
 extern int pty_limit;                   /* Config limit on Unix98 ptys */
-static DEFINE_IDA(allocated_ptys);
 static DEFINE_MUTEX(allocated_ptys_lock);
 static struct vfsmount *devpts_mnt;
-static struct dentry *devpts_root;
-static struct {
+struct pts_mount_opts {
        int setuid;
        int setgid;
        uid_t   uid;
        gid_t   gid;
        umode_t mode;
-} config = {.mode = DEVPTS_DEFAULT_MODE};
+        umode_t ptmxmode;
+        int newinstance;
+};
 enum {
-        Opt_uid, Opt_gid, Opt_mode,
+        Opt_uid, Opt_gid, Opt_mode, Opt_ptmxmode, Opt_newinstance,
        Opt_err
 };
@@ -53,18 +60,50 @@ static const match_table_t tokens = {
        {Opt_uid, "uid=%u"},
        {Opt_gid, "gid=%u"},
        {Opt_mode, "mode=%o"},
+#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
+        {Opt_ptmxmode, "ptmxmode=%o"},
+        {Opt_newinstance, "newinstance"},
+#endif
        {Opt_err, NULL}
 };
-static int devpts_remount(struct super_block *sb, int *flags, char *data)
+/* Per-superblock devpts state, reached through sb->s_fs_info (see DEVPTS_SB()). */
+struct pts_fs_info {
+        struct ida allocated_ptys;              /* ida the pty indices are taken from */
+        struct pts_mount_opts mount_opts;       /* options filled in by parse_mount_options() */
+        struct dentry *ptmx_dentry;             /* 'ptmx' node, set once mknod_ptmx() has created it */
+};
+/* Fetch the devpts-private info stored in @sb->s_fs_info. */
+static inline struct pts_fs_info *DEVPTS_SB(struct super_block *sb)
+{
+        struct pts_fs_info *fsi = sb->s_fs_info;
+        return fsi;
+}
+/*
+ * Pick the devpts superblock that @inode belongs to.  With
+ * CONFIG_DEVPTS_MULTIPLE_INSTANCES, an inode whose superblock carries
+ * DEVPTS_SUPER_MAGIC selects that superblock; in every other case the
+ * superblock of the kernel's devpts_mnt is used.
+ */
+static inline struct super_block *pts_sb_from_inode(struct inode *inode)
+{
+#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
+        struct super_block *own_sb = inode->i_sb;
+        if (own_sb->s_magic == DEVPTS_SUPER_MAGIC)
+                return own_sb;
+#endif
+        return devpts_mnt->mnt_sb;
+}
+#define PARSE_MOUNT     0
+#define PARSE_REMOUNT   1
+static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
 {
        char *p;
-        config.setuid  = 0;
+        opts->setuid  = 0;
-        config.setgid  = 0;
+        opts->setgid  = 0;
-        config.uid     = 0;
+        opts->uid     = 0;
-        config.gid     = 0;
+        opts->gid     = 0;
-        config.mode    = DEVPTS_DEFAULT_MODE;
+        opts->mode    = DEVPTS_DEFAULT_MODE;
+        opts->ptmxmode = DEVPTS_DEFAULT_PTMX_MODE;
+        /* newinstance makes sense only on initial mount */
+        if (op == PARSE_MOUNT)
+                opts->newinstance = 0;
        while ((p = strsep(&data, ",")) != NULL) {
                substring_t args[MAX_OPT_ARGS];
@@ -79,20 +118,32 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data)
                case Opt_uid:
                        if (match_int(&args[0], &option))
                                return -EINVAL;
-                        config.uid = option;
+                        opts->uid = option;
-                        config.setuid = 1;
+                        opts->setuid = 1;
                        break;
                case Opt_gid:
                        if (match_int(&args[0], &option))
                                return -EINVAL;
-                        config.gid = option;
+                        opts->gid = option;
-                        config.setgid = 1;
+                        opts->setgid = 1;
                        break;
                case Opt_mode:
                        if (match_octal(&args[0], &option))
                                return -EINVAL;
-                        config.mode = option & S_IALLUGO;
+                        opts->mode = option & S_IALLUGO;
+                        break;
+#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
+                case Opt_ptmxmode:
+                        if (match_octal(&args[0], &option))
+                                return -EINVAL;
+                        opts->ptmxmode = option & S_IALLUGO;
+                        break;
+                case Opt_newinstance:
+                        /* newinstance makes sense only on initial mount */
+                        if (op == PARSE_MOUNT)
+                                opts->newinstance = 1;
                        break;
+#endif
                default:
                        printk(KERN_ERR "devpts: called with bogus options\n");
                        return -EINVAL;
@@ -102,13 +153,106 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data)
        return 0;
 }
+#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
+/*
+ * Create the 'ptmx' character-device node in the root directory of @sb.
+ *
+ * The whole operation runs under the root inode's i_mutex.  Returns 0 when
+ * the node was created or had already been created for this superblock,
+ * and -ENOMEM when the dentry or the inode could not be allocated.
+ */
+static int mknod_ptmx(struct super_block *sb)
+{
+        int mode;
+        int rc = -ENOMEM;
+        struct dentry *dentry;
+        struct inode *inode;
+        struct dentry *root = sb->s_root;
+        struct pts_fs_info *fsi = DEVPTS_SB(sb);
+        struct pts_mount_opts *opts = &fsi->mount_opts;
+        mutex_lock(&root->d_inode->i_mutex);
+        /* If we have already created ptmx node, return */
+        if (fsi->ptmx_dentry) {
+                rc = 0;
+                goto out;
+        }
+        dentry = d_alloc_name(root, "ptmx");
+        if (!dentry) {
+                printk(KERN_NOTICE "Unable to alloc dentry for ptmx node\n");
+                goto out;
+        }
+        /*
+         * Create a new 'ptmx' node in this mount of devpts.
+         */
+        inode = new_inode(sb);
+        if (!inode) {
+                printk(KERN_ERR "Unable to alloc inode for ptmx node\n");
+                /* drop the dentry allocated above; rc is still -ENOMEM */
+                dput(dentry);
+                goto out;
+        }
+        /*
+         * The root directory uses ino 1 and pty nodes use index + 3, which
+         * leaves ino 2 for ptmx.  The minor below is the literal 2; the
+         * file also defines PTMX_MINOR as 2.
+         */
+        inode->i_ino = 2;
+        inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+        mode = S_IFCHR|opts->ptmxmode;
+        init_special_inode(inode, mode, MKDEV(TTYAUX_MAJOR, 2));
+        d_add(dentry, inode);
+        /* remember the node so later calls return early and remount can update its mode */
+        fsi->ptmx_dentry = dentry;
+        rc = 0;
+        printk(KERN_DEBUG "Created ptmx node in devpts ino %lu\n",
+                        inode->i_ino);
+out:
+        mutex_unlock(&root->d_inode->i_mutex);
+        return rc;
+}
+/* Re-apply the ptmxmode mount option to the 'ptmx' inode, if the node exists. */
+static void update_ptmx_mode(struct pts_fs_info *fsi)
+{
+        struct dentry *ptmx = fsi->ptmx_dentry;
+        if (!ptmx)
+                return;
+        ptmx->d_inode->i_mode = S_IFCHR|fsi->mount_opts.ptmxmode;
+}
+#else
+/* mknod_ptmx() is not built without CONFIG_DEVPTS_MULTIPLE_INSTANCES: nothing to update. */
+static inline void update_ptmx_mode(struct pts_fs_info *fsi)
+{
+}
+#endif
+/*
+ * Remount handler: re-parse @data into the superblock's mount options and
+ * return the parser's result.
+ */
+static int devpts_remount(struct super_block *sb, int *flags, char *data)
+{
+        struct pts_fs_info *fsi = DEVPTS_SB(sb);
+        int rc = parse_mount_options(data, PARSE_REMOUNT, &fsi->mount_opts);
+        /*
+         * The parser first resets every option to its default and only then
+         * applies @data, so ptmxmode may have changed either way.  Push it
+         * into the ptmx inode unconditionally: bogus options don't fail the
+         * remount, so this is done even when rc is an error.
+         */
+        update_ptmx_mode(fsi);
+        return rc;
+}
 static int devpts_show_options(struct seq_file *seq, struct vfsmount *vfs)
 {
-        if (config.setuid)
+        struct pts_fs_info *fsi = DEVPTS_SB(vfs->mnt_sb);
-                seq_printf(seq, ",uid=%u", config.uid);
+        struct pts_mount_opts *opts = &fsi->mount_opts;
-        if (config.setgid)
-                seq_printf(seq, ",gid=%u", config.gid);
+        if (opts->setuid)
-        seq_printf(seq, ",mode=%03o", config.mode);
+                seq_printf(seq, ",uid=%u", opts->uid);
+        if (opts->setgid)
+                seq_printf(seq, ",gid=%u", opts->gid);
+        seq_printf(seq, ",mode=%03o", opts->mode);
+#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
+        seq_printf(seq, ",ptmxmode=%03o", opts->ptmxmode);
+#endif
        return 0;
 }
@@ -119,10 +263,25 @@ static const struct super_operations devpts_sops = {
        .show_options   = devpts_show_options,
 };
+/*
+ * Allocate a zeroed pts_fs_info with an initialised ida and the default
+ * pty and ptmx modes.  Returns NULL if the allocation fails.
+ */
+static void *new_pts_fs_info(void)
+{
+        struct pts_fs_info *fsi = kzalloc(sizeof(struct pts_fs_info), GFP_KERNEL);
+        if (fsi) {
+                ida_init(&fsi->allocated_ptys);
+                fsi->mount_opts.mode = DEVPTS_DEFAULT_MODE;
+                fsi->mount_opts.ptmxmode = DEVPTS_DEFAULT_PTMX_MODE;
+        }
+        return fsi;
+}
 static int
 devpts_fill_super(struct super_block *s, void *data, int silent)
 {
-        struct inode * inode;
+        struct inode *inode;
        s->s_blocksize = 1024;
        s->s_blocksize_bits = 10;
@@ -130,39 +289,240 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
        s->s_op = &devpts_sops;
        s->s_time_gran = 1;
+        s->s_fs_info = new_pts_fs_info();
+        if (!s->s_fs_info)
+                goto fail;
        inode = new_inode(s);
        if (!inode)
-                goto fail;
+                goto free_fsi;
        inode->i_ino = 1;
        inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
-        inode->i_blocks = 0;
-        inode->i_uid = inode->i_gid = 0;
        inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
        inode->i_op = &simple_dir_inode_operations;
        inode->i_fop = &simple_dir_operations;
        inode->i_nlink = 2;
-        devpts_root = s->s_root = d_alloc_root(inode);
+        s->s_root = d_alloc_root(inode);
        if (s->s_root)
                return 0;
-        
-        printk("devpts: get root dentry failed\n");
+        printk(KERN_ERR "devpts: get root dentry failed\n");
        iput(inode);
+free_fsi:
+        kfree(s->s_fs_info);
 fail:
        return -ENOMEM;
 }
+#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
+/*
+ * sget() test callback: matches only the superblock of devpts_mnt, and
+ * matches nothing while devpts_mnt is still unset.  @p is unused.
+ */
+static int compare_init_pts_sb(struct super_block *s, void *p)
+{
+        if (!devpts_mnt)
+                return 0;
+        return devpts_mnt->mnt_sb == s;
+}
+/*
+ * Parse the mount options in @data into @opts without modifying @data.
+ *
+ * devpts parses its options twice during mount because of the two modes
+ * it supports: once in devpts_get_sb() to decide between single-instance
+ * and multi-instance mode, and once more in devpts_remount() or
+ * new_pts_mount() depending on that mode.
+ *
+ * Parsing modifies the option string, which would make the second parse
+ * incorrect, so the parser is run on a private PAGE_SIZE copy of @data.
+ * When @data is NULL nothing is parsed and @opts is left untouched.
+ *
+ * Return: 0 on success, -errno on error
+ */
+static int safe_parse_mount_options(void *data, struct pts_mount_opts *opts)
+{
+        char *scratch;
+        int err;
+        if (data == NULL)
+                return 0;
+        /* Use kstrdup() ?  */
+        scratch = kmalloc(PAGE_SIZE, GFP_KERNEL);
+        if (scratch == NULL)
+                return -ENOMEM;
+        memcpy(scratch, data, PAGE_SIZE);
+        err = parse_mount_options(scratch, PARSE_MOUNT, opts);
+        kfree(scratch);
+        return err;
+}
+/*
+ * Mount a new (private) instance of devpts.  PTYs created in this
+ * instance are independent of the PTYs in other devpts instances.
+ *
+ * The superblock is set up through get_sb_nodev(), the options in @data
+ * are parsed into its pts_mount_opts, and a 'ptmx' node is created in
+ * its root.
+ *
+ * Return: 0 on success; otherwise the error from get_sb_nodev(),
+ * parse_mount_options() or mknod_ptmx().
+ */
+static int new_pts_mount(struct file_system_type *fs_type, int flags,
+                void *data, struct vfsmount *mnt)
+{
+        int err;
+        struct pts_fs_info *fsi;
+        struct pts_mount_opts *opts;
+        printk(KERN_NOTICE "devpts: newinstance mount\n");
+        err = get_sb_nodev(fs_type, flags, data, devpts_fill_super, mnt);
+        if (err)
+                return err;
+        fsi = DEVPTS_SB(mnt->mnt_sb);
+        opts = &fsi->mount_opts;
+        err = parse_mount_options(data, PARSE_MOUNT, opts);
+        if (err)
+                goto fail;
+        err = mknod_ptmx(mnt->mnt_sb);
+        if (err)
+                goto fail;
+        return 0;
+fail:
+        /*
+         * A successful get_sb_nodev() hands the superblock back still
+         * write-locked on s_umount.  Release it before deactivate_super(),
+         * as the devpts_fill_super() failure path in get_init_pts_sb()
+         * does; otherwise the teardown blocks on a lock we hold ourselves.
+         */
+        dput(mnt->mnt_sb->s_root);
+        up_write(&mnt->mnt_sb->s_umount);
+        deactivate_super(mnt->mnt_sb);
+        return err;
+}
+/*
+ * Report whether the 'newinstance' mount option is present in @data.
+ * The options are parsed on a private copy, so @data is not modified.
+ *
+ * Return: -errno       on error (eg: invalid mount options specified)
+ *       : 1            if 'newinstance' mount option was specified
+ *       : 0            if 'newinstance' mount option was NOT specified
+ *                      (including when @data is NULL)
+ */
+static int is_new_instance_mount(void *data)
+{
+        struct pts_mount_opts parsed;
+        int err;
+        if (!data)
+                return 0;
+        err = safe_parse_mount_options(data, &parsed);
+        if (err)
+                return err;
+        return parsed.newinstance;
+}
+/*
+ * get_init_pts_sb()
+ *
+ *     This interface is needed to support multiple namespace semantics in
+ *     devpts while preserving backward compatibility of the current 'single-
+ *     namespace' semantics, i.e. all mounts of devpts without the
+ *     'newinstance' mount option should bind to the initial kernel mount,
+ *     like get_sb_single().
+ *
+ *     Mounts with 'newinstance' option create a new private namespace.
+ *
+ *     But for single-mount semantics, devpts cannot use get_sb_single(),
+ *     because get_sb_single()/sget() find and use the super-block from
+ *     the most recent mount of devpts. But that recent mount may be a
+ *     'newinstance' mount and get_sb_single() would pick the newinstance
+ *     super-block instead of the initial super-block.
+ *
+ *     This interface is identical to get_sb_single() except that it
+ *     consistently selects the 'single-namespace' superblock even in the
+ *     presence of the private namespace (i.e. 'newinstance') super-blocks.
+ *
+ *     Returns the PTR_ERR() of a failed sget(), the error from
+ *     devpts_fill_super(), or otherwise the result of simple_set_mnt().
+ */
+static int get_init_pts_sb(struct file_system_type *fs_type, int flags,
+                void *data, struct vfsmount *mnt)
+{
+        struct super_block *s;
+        int error;
+        /* compare_init_pts_sb() only ever matches the superblock of devpts_mnt */
+        s = sget(fs_type, compare_init_pts_sb, set_anon_super, NULL);
+        if (IS_ERR(s))
+                return PTR_ERR(s);
+        /* No root dentry yet: this superblock still has to be filled in */
+        if (!s->s_root) {
+                s->s_flags = flags;
+                error = devpts_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
+                if (error) {
+                        /*
+                         * NOTE(review): s_umount is presumably held on
+                         * return from sget() - confirm.  It is released
+                         * here before the superblock is deactivated.
+                         */
+                        up_write(&s->s_umount);
+                        deactivate_super(s);
+                        return error;
+                }
+                s->s_flags |= MS_ACTIVE;
+        }
+        /* The return value of do_remount_sb() is deliberately not checked here */
+        do_remount_sb(s, flags, data, 0);
+        return simple_set_mnt(mnt, s);
+}
+/*
+ * Mount or remount the initial kernel mount of devpts. This type of
+ * mount maintains the legacy, single-instance semantics, while the
+ * kernel still allows multiple-instances.
+ *
+ * Return: 0 on success; otherwise the error from get_init_pts_sb() or
+ * mknod_ptmx().
+ */
+static int init_pts_mount(struct file_system_type *fs_type, int flags,
+                void *data, struct vfsmount *mnt)
+{
+        int err;
+        err = get_init_pts_sb(fs_type, flags, data, mnt);
+        if (err)
+                return err;
+        err = mknod_ptmx(mnt->mnt_sb);
+        if (err) {
+                /*
+                 * get_init_pts_sb() succeeded, so the superblock is still
+                 * write-locked on s_umount.  Release it before
+                 * deactivate_super(), as get_init_pts_sb() does on its own
+                 * devpts_fill_super() failure path; otherwise the teardown
+                 * blocks on a lock we hold ourselves.
+                 */
+                dput(mnt->mnt_sb->s_root);
+                up_write(&mnt->mnt_sb->s_umount);
+                deactivate_super(mnt->mnt_sb);
+        }
+        return err;
+}
 static int devpts_get_sb(struct file_system_type *fs_type,
        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
+        int new;
+        new = is_new_instance_mount(data);
+        if (new < 0)
+                return new;
+        if (new)
+                return new_pts_mount(fs_type, flags, data, mnt);
+        return init_pts_mount(fs_type, flags, data, mnt);
+}
+#else
+/*
+ * This supports only the legacy single-instance semantics (no
+ * multiple-instance semantics)
+ */
+static int devpts_get_sb(struct file_system_type *fs_type, int flags,
+                const char *dev_name, void *data, struct vfsmount *mnt)
+{
        return get_sb_single(fs_type, flags, data, devpts_fill_super, mnt);
 }
+#endif
+/*
+ * ->kill_sb for devpts: free the pts_fs_info attached to @sb, then tear
+ * the superblock down with kill_litter_super().
+ * NOTE(review): fsi->allocated_ptys is never handed to ida_destroy() -
+ * confirm whether that leaks ida memory.
+ */
+static void devpts_kill_sb(struct super_block *sb)
+{
+        kfree(DEVPTS_SB(sb));
+        kill_litter_super(sb);
+}
 static struct file_system_type devpts_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "devpts",
        .get_sb         = devpts_get_sb,
-        .kill_sb        = kill_anon_super,
+        .kill_sb        = devpts_kill_sb,
 };
 /*
@@ -172,16 +532,17 @@ static struct file_system_type devpts_fs_type = {
 int devpts_new_index(struct inode *ptmx_inode)
 {
+        struct super_block *sb = pts_sb_from_inode(ptmx_inode);
+        struct pts_fs_info *fsi = DEVPTS_SB(sb);
        int index;
        int ida_ret;
 retry:
-        if (!ida_pre_get(&allocated_ptys, GFP_KERNEL)) {
+        if (!ida_pre_get(&fsi->allocated_ptys, GFP_KERNEL))
                return -ENOMEM;
-        }
        mutex_lock(&allocated_ptys_lock);
-        ida_ret = ida_get_new(&allocated_ptys, &index);
+        ida_ret = ida_get_new(&fsi->allocated_ptys, &index);
        if (ida_ret < 0) {
                mutex_unlock(&allocated_ptys_lock);
                if (ida_ret == -EAGAIN)
@@ -190,7 +551,7 @@ retry:
        }
        if (index >= pty_limit) {
-                ida_remove(&allocated_ptys, index);
+                ida_remove(&fsi->allocated_ptys, index);
                mutex_unlock(&allocated_ptys_lock);
                return -EIO;
        }
@@ -200,18 +561,26 @@ retry:
 void devpts_kill_index(struct inode *ptmx_inode, int idx)
 {
+        struct super_block *sb = pts_sb_from_inode(ptmx_inode);
+        struct pts_fs_info *fsi = DEVPTS_SB(sb);
        mutex_lock(&allocated_ptys_lock);
-        ida_remove(&allocated_ptys, idx);
+        ida_remove(&fsi->allocated_ptys, idx);
        mutex_unlock(&allocated_ptys_lock);
 }
 int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty)
 {
-        int number = tty->index; /* tty layer puts index from devpts_new_index() in here */
+        /* tty layer puts index from devpts_new_index() in here */
+        int number = tty->index;
        struct tty_driver *driver = tty->driver;
        dev_t device = MKDEV(driver->major, driver->minor_start+number);
        struct dentry *dentry;
-        struct inode *inode = new_inode(devpts_mnt->mnt_sb);
+        struct super_block *sb = pts_sb_from_inode(ptmx_inode);
+        struct inode *inode = new_inode(sb);
+        struct dentry *root = sb->s_root;
+        struct pts_fs_info *fsi = DEVPTS_SB(sb);
+        struct pts_mount_opts *opts = &fsi->mount_opts;
        char s[12];
        /* We're supposed to be given the slave end of a pty */
@@ -221,25 +590,25 @@ int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty)
        if (!inode)
                return -ENOMEM;
-        inode->i_ino = number+2;
+        inode->i_ino = number + 3;
-        inode->i_uid = config.setuid ? config.uid : current_fsuid();
+        inode->i_uid = opts->setuid ? opts->uid : current_fsuid();
-        inode->i_gid = config.setgid ? config.gid : current_fsgid();
+        inode->i_gid = opts->setgid ? opts->gid : current_fsgid();
        inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
-        init_special_inode(inode, S_IFCHR|config.mode, device);
+        init_special_inode(inode, S_IFCHR|opts->mode, device);
        inode->i_private = tty;
        tty->driver_data = inode;
        sprintf(s, "%d", number);
-        mutex_lock(&devpts_root->d_inode->i_mutex);
+        mutex_lock(&root->d_inode->i_mutex);
-        dentry = d_alloc_name(devpts_root, s);
+        dentry = d_alloc_name(root, s);
        if (!IS_ERR(dentry)) {
                d_add(dentry, inode);
-                fsnotify_create(devpts_root->d_inode, dentry);
+                fsnotify_create(root->d_inode, dentry);
        }
-        mutex_unlock(&devpts_root->d_inode->i_mutex);
+        mutex_unlock(&root->d_inode->i_mutex);
        return 0;
 }
@@ -256,20 +625,27 @@ struct tty_struct *devpts_get_tty(struct inode *pts_inode, int number)
 void devpts_pty_kill(struct tty_struct *tty)
 {
        struct inode *inode = tty->driver_data;
+        struct super_block *sb = pts_sb_from_inode(inode);
+        struct dentry *root = sb->s_root;
        struct dentry *dentry;
        BUG_ON(inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR));
-        mutex_lock(&devpts_root->d_inode->i_mutex);
+        mutex_lock(&root->d_inode->i_mutex);
        dentry = d_find_alias(inode);
-        if (dentry && !IS_ERR(dentry)) {
+        if (IS_ERR(dentry))
+                goto out;
+        if (dentry) {
                inode->i_nlink--;
                d_delete(dentry);
-                dput(dentry);
+                dput(dentry);   /* d_alloc_name() in devpts_pty_new() */
        }
-        mutex_unlock(&devpts_root->d_inode->i_mutex);
+        dput(dentry);           /* d_find_alias above */
+out:
+        mutex_unlock(&root->d_inode->i_mutex);
 }
 static int __init init_devpts_fs(void)
diff --git a/fs/direct-io.c b/fs/direct-io.c
index af0558dbe8b7..b6d43908ff7a 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1209,6 +1209,19 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
        retval = direct_io_worker(rw, iocb, inode, iov, offset,
                                nr_segs, blkbits, get_block, end_io, dio);
+        /*
+         * In case of error, an extending write may have instantiated a few
+         * blocks outside i_size. Trim these off again for DIO_LOCKING.
+         * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this in
+         * their own manner.
+         */
+        if (unlikely(retval < 0 && (rw & WRITE))) {
+                loff_t isize = i_size_read(inode);
+                if (end > isize && dio_lock_type == DIO_LOCKING)
+                        vmtruncate(inode, isize);
+        }
        if (rw == READ && dio_lock_type == DIO_LOCKING)
                release_i_mutex = 0;
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index 8bf31e3fbf01..dc2ad6008b2d 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -2,7 +2,7 @@
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -33,10 +33,10 @@ void dlm_del_ast(struct dlm_lkb *lkb)
        spin_unlock(&ast_queue_lock);
 }
-void dlm_add_ast(struct dlm_lkb *lkb, int type)
+void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode)
 {
        if (lkb->lkb_flags & DLM_IFL_USER) {
-                dlm_user_add_ast(lkb, type);
+                dlm_user_add_ast(lkb, type, bastmode);
                return;
        }
@@ -46,6 +46,8 @@ void dlm_add_ast(struct dlm_lkb *lkb, int type)
                list_add_tail(&lkb->lkb_astqueue, &ast_queue);
        }
        lkb->lkb_ast_type |= type;
+        if (bastmode)
+                lkb->lkb_bastmode = bastmode;
        spin_unlock(&ast_queue_lock);
        set_bit(WAKE_ASTS, &astd_wakeflags);
@@ -59,50 +61,40 @@ static void process_asts(void)
        struct dlm_lkb *lkb;
        void (*cast) (void *astparam);
        void (*bast) (void *astparam, int mode);
-        int type = 0, found, bmode;
+        int type = 0, bastmode;
-        for (;;) {
+repeat:
-                found = 0;
+        spin_lock(&ast_queue_lock);
-                spin_lock(&ast_queue_lock);
+        list_for_each_entry(lkb, &ast_queue, lkb_astqueue) {
-                list_for_each_entry(lkb, &ast_queue, lkb_astqueue) {
+                r = lkb->lkb_resource;
-                        r = lkb->lkb_resource;
+                ls = r->res_ls;
-                        ls = r->res_ls;
+                if (dlm_locking_stopped(ls))
-                        if (dlm_locking_stopped(ls))
+                        continue;
-                                continue;
-                        list_del(&lkb->lkb_astqueue);
-                        type = lkb->lkb_ast_type;
-                        lkb->lkb_ast_type = 0;
-                        found = 1;
-                        break;
-                }
-                spin_unlock(&ast_queue_lock);
-                if (!found)
+                list_del(&lkb->lkb_astqueue);
-                        break;
+                type = lkb->lkb_ast_type;
+                lkb->lkb_ast_type = 0;
+                bastmode = lkb->lkb_bastmode;
+                spin_unlock(&ast_queue_lock);
                cast = lkb->lkb_astfn;
                bast = lkb->lkb_bastfn;
-                bmode = lkb->lkb_bastmode;
                if ((type & AST_COMP) && cast)
                        cast(lkb->lkb_astparam);
-                /* FIXME: Is it safe to look at lkb_grmode here
-                   without doing a lock_rsb() ?
-                   Look at other checks in v1 to avoid basts. */
                if ((type & AST_BAST) && bast)
-                        if (!dlm_modes_compat(lkb->lkb_grmode, bmode))
+                        bast(lkb->lkb_astparam, bastmode);
-                                bast(lkb->lkb_astparam, bmode);
                /* this removes the reference added by dlm_add_ast
                   and may result in the lkb being freed */
                dlm_put_lkb(lkb);
-                schedule();
+                cond_resched();
+                goto repeat;
        }
+        spin_unlock(&ast_queue_lock);
 }
 static inline int no_asts(void)
diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h
index 6ee276c74c52..1b5fc5f428fd 100644
--- a/fs/dlm/ast.h
+++ b/fs/dlm/ast.h
@@ -1,7 +1,7 @@
 /******************************************************************************
 *******************************************************************************
 **
-**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2005-2008 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -13,7 +13,7 @@
 #ifndef __ASTD_DOT_H__
 #define __ASTD_DOT_H__
-void dlm_add_ast(struct dlm_lkb *lkb, int type);
+void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode);
 void dlm_del_ast(struct dlm_lkb *lkb);
 void dlm_astd_wake(void);
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 8fc24f4507a3..1d1d27442235 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -1,7 +1,7 @@
 /******************************************************************************
 *******************************************************************************
 **
-**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2005-2009 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -25,19 +25,6 @@ static struct mutex debug_buf_lock;
 static struct dentry *dlm_root;
-struct rsb_iter {
-        int entry;
-        int locks;
-        int header;
-        struct dlm_ls *ls;
-        struct list_head *next;
-        struct dlm_rsb *rsb;
-};
-/*
- * dump all rsb's in the lockspace hash table
- */
 static char *print_lockmode(int mode)
 {
        switch (mode) {
@@ -60,13 +47,13 @@ static char *print_lockmode(int mode)
        }
 }
-static void print_resource_lock(struct seq_file *s, struct dlm_lkb *lkb,
+static int print_format1_lock(struct seq_file *s, struct dlm_lkb *lkb,
-                                struct dlm_rsb *res)
+                              struct dlm_rsb *res)
 {
        seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode));
-        if (lkb->lkb_status == DLM_LKSTS_CONVERT
+        if (lkb->lkb_status == DLM_LKSTS_CONVERT ||
-            || lkb->lkb_status == DLM_LKSTS_WAITING)
+            lkb->lkb_status == DLM_LKSTS_WAITING)
                seq_printf(s, " (%s)", print_lockmode(lkb->lkb_rqmode));
        if (lkb->lkb_nodeid) {
@@ -80,33 +67,42 @@ static void print_resource_lock(struct seq_file *s, struct dlm_lkb *lkb,
        if (lkb->lkb_wait_type)
                seq_printf(s, " wait_type: %d", lkb->lkb_wait_type);
-        seq_printf(s, "\n");
+        return seq_printf(s, "\n");
 }
-static int print_resource(struct dlm_rsb *res, struct seq_file *s)
+static int print_format1(struct dlm_rsb *res, struct seq_file *s)
 {
        struct dlm_lkb *lkb;
        int i, lvblen = res->res_ls->ls_lvblen, recover_list, root_list;
+        int rv;
        lock_rsb(res);
-        seq_printf(s, "\nResource %p Name (len=%d) \"", res, res->res_length);
+        rv = seq_printf(s, "\nResource %p Name (len=%d) \"",
+                        res, res->res_length);
+        if (rv)
+                goto out;
        for (i = 0; i < res->res_length; i++) {
                if (isprint(res->res_name[i]))
                        seq_printf(s, "%c", res->res_name[i]);
                else
                        seq_printf(s, "%c", '.');
        }
        if (res->res_nodeid > 0)
-                seq_printf(s, "\"  \nLocal Copy, Master is node %d\n",
+                rv = seq_printf(s, "\"  \nLocal Copy, Master is node %d\n",
-                           res->res_nodeid);
+                                res->res_nodeid);
        else if (res->res_nodeid == 0)
-                seq_printf(s, "\"  \nMaster Copy\n");
+                rv = seq_printf(s, "\"  \nMaster Copy\n");
        else if (res->res_nodeid == -1)
-                seq_printf(s, "\"  \nLooking up master (lkid %x)\n",
+                rv = seq_printf(s, "\"  \nLooking up master (lkid %x)\n",
-                           res->res_first_lkid);
+                                res->res_first_lkid);
        else
-                seq_printf(s, "\"  \nInvalid master %d\n", res->res_nodeid);
+                rv = seq_printf(s, "\"  \nInvalid master %d\n",
+                                res->res_nodeid);
+        if (rv)
+                goto out;
        /* Print the LVB: */
        if (res->res_lvbptr) {
@@ -119,329 +115,489 @@ static int print_resource(struct dlm_rsb *res, struct seq_file *s)
                }
                if (rsb_flag(res, RSB_VALNOTVALID))
                        seq_printf(s, " (INVALID)");
-                seq_printf(s, "\n");
+                rv = seq_printf(s, "\n");
+                if (rv)
+                        goto out;
        }
        root_list = !list_empty(&res->res_root_list);
        recover_list = !list_empty(&res->res_recover_list);
        if (root_list || recover_list) {
-                seq_printf(s, "Recovery: root %d recover %d flags %lx "
+                rv = seq_printf(s, "Recovery: root %d recover %d flags %lx "
-                           "count %d\n", root_list, recover_list,
+                                "count %d\n", root_list, recover_list,
-                           res->res_flags, res->res_recover_locks_count);
+                                res->res_flags, res->res_recover_locks_count);
+                if (rv)
+                        goto out;
        }
        /* Print the locks attached to this resource */
        seq_printf(s, "Granted Queue\n");
-        list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue)
+        list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue) {
-                print_resource_lock(s, lkb, res);
+                rv = print_format1_lock(s, lkb, res);
+                if (rv)
+                        goto out;
+        }
        seq_printf(s, "Conversion Queue\n");
-        list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue)
+        list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue) {
-                print_resource_lock(s, lkb, res);
+                rv = print_format1_lock(s, lkb, res);
+                if (rv)
+                        goto out;
+        }
        seq_printf(s, "Waiting Queue\n");
-        list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue)
+        list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue) {
-                print_resource_lock(s, lkb, res);
+                rv = print_format1_lock(s, lkb, res);
+                if (rv)
+                        goto out;
+        }
        if (list_empty(&res->res_lookup))
                goto out;
        seq_printf(s, "Lookup Queue\n");
        list_for_each_entry(lkb, &res->res_lookup, lkb_rsb_lookup) {
-                seq_printf(s, "%08x %s", lkb->lkb_id,
+                rv = seq_printf(s, "%08x %s", lkb->lkb_id,
-                           print_lockmode(lkb->lkb_rqmode));
+                                print_lockmode(lkb->lkb_rqmode));
                if (lkb->lkb_wait_type)
                        seq_printf(s, " wait_type: %d", lkb->lkb_wait_type);
-                seq_printf(s, "\n");
+                rv = seq_printf(s, "\n");
        }
 out:
        unlock_rsb(res);
-        return 0;
+        return rv;
 }
-static void print_lock(struct seq_file *s, struct dlm_lkb *lkb, struct dlm_rsb *r)
+static int print_format2_lock(struct seq_file *s, struct dlm_lkb *lkb,
+                              struct dlm_rsb *r)
 {
-        unsigned int waiting = 0;
+        u64 xid = 0;
-        uint64_t xid = 0;
+        u64 us;
+        int rv;
        if (lkb->lkb_flags & DLM_IFL_USER) {
                if (lkb->lkb_ua)
                        xid = lkb->lkb_ua->xid;
        }
-        if (lkb->lkb_timestamp)
+        /* microseconds since lkb was added to current queue */
-                waiting = jiffies_to_msecs(jiffies - lkb->lkb_timestamp);
+        us = ktime_to_us(ktime_sub(ktime_get(), lkb->lkb_timestamp));
-        /* id nodeid remid pid xid exflags flags sts grmode rqmode time_ms
+        /* id nodeid remid pid xid exflags flags sts grmode rqmode time_us
           r_nodeid r_len r_name */
-        seq_printf(s, "%x %d %x %u %llu %x %x %d %d %d %u %u %d \"%s\"\n",
+        rv = seq_printf(s, "%x %d %x %u %llu %x %x %d %d %d %llu %u %d \"%s\"\n",
-                   lkb->lkb_id,
+                        lkb->lkb_id,
-                   lkb->lkb_nodeid,
+                        lkb->lkb_nodeid,
-                   lkb->lkb_remid,
+                        lkb->lkb_remid,
-                   lkb->lkb_ownpid,
+                        lkb->lkb_ownpid,
-                   (unsigned long long)xid,
+                        (unsigned long long)xid,
-                   lkb->lkb_exflags,
+                        lkb->lkb_exflags,
-                   lkb->lkb_flags,
+                        lkb->lkb_flags,
-                   lkb->lkb_status,
+                        lkb->lkb_status,
-                   lkb->lkb_grmode,
+                        lkb->lkb_grmode,
-                   lkb->lkb_rqmode,
+                        lkb->lkb_rqmode,
-                   waiting,
+                        (unsigned long long)us,
-                   r->res_nodeid,
+                        r->res_nodeid,
-                   r->res_length,
+                        r->res_length,
-                   r->res_name);
+                        r->res_name);
+        return rv;
 }
-static int print_locks(struct dlm_rsb *r, struct seq_file *s)
+static int print_format2(struct dlm_rsb *r, struct seq_file *s)
 {
        struct dlm_lkb *lkb;
+        int rv = 0;
        lock_rsb(r);
-        list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue)
+        list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
-                print_lock(s, lkb, r);
+                rv = print_format2_lock(s, lkb, r);
+                if (rv)
-        list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue)
+                        goto out;
-                print_lock(s, lkb, r);
+        }
-        list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue)
+        list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
-                print_lock(s, lkb, r);
+                rv = print_format2_lock(s, lkb, r);
+                if (rv)
+                        goto out;
+        }
+        list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) {
+                rv = print_format2_lock(s, lkb, r);
+                if (rv)
+                        goto out;
+        }
+ out:
        unlock_rsb(r);
-        return 0;
+        return rv;
 }
-static int rsb_iter_next(struct rsb_iter *ri)
+static int print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb,
+                              int rsb_lookup)
 {
-        struct dlm_ls *ls = ri->ls;
+        u64 xid = 0;
-        int i;
+        int rv;
-        if (!ri->next) {
- top:
-                /* Find the next non-empty hash bucket */
-                for (i = ri->entry; i < ls->ls_rsbtbl_size; i++) {
-                        read_lock(&ls->ls_rsbtbl[i].lock);
-                        if (!list_empty(&ls->ls_rsbtbl[i].list)) {
-                                ri->next = ls->ls_rsbtbl[i].list.next;
-                                ri->rsb = list_entry(ri->next, struct dlm_rsb,
-                                                        res_hashchain);
-                                dlm_hold_rsb(ri->rsb);
-                                read_unlock(&ls->ls_rsbtbl[i].lock);
-                                break;
-                        }
-                        read_unlock(&ls->ls_rsbtbl[i].lock);
-                }
-                ri->entry = i;
-                if (ri->entry >= ls->ls_rsbtbl_size)
-                        return 1;
-        } else {
-                struct dlm_rsb *old = ri->rsb;
-                i = ri->entry;
-                read_lock(&ls->ls_rsbtbl[i].lock);
-                ri->next = ri->next->next;
-                if (ri->next->next == ls->ls_rsbtbl[i].list.next) {
-                        /* End of list - move to next bucket */
-                        ri->next = NULL;
-                        ri->entry++;
-                        read_unlock(&ls->ls_rsbtbl[i].lock);
-                        dlm_put_rsb(old);
-                        goto top;
-                }
-                ri->rsb = list_entry(ri->next, struct dlm_rsb, res_hashchain);
-                dlm_hold_rsb(ri->rsb);
-                read_unlock(&ls->ls_rsbtbl[i].lock);
-                dlm_put_rsb(old);
-        }
-        return 0;
+        if (lkb->lkb_flags & DLM_IFL_USER) {
-}
+                if (lkb->lkb_ua)
+                        xid = lkb->lkb_ua->xid;
+        }
-static void rsb_iter_free(struct rsb_iter *ri)
+        rv = seq_printf(s, "lkb %x %d %x %u %llu %x %x %d %d %d %d %d %d %u %llu %llu\n",
-{
+                        lkb->lkb_id,
-        kfree(ri);
+                        lkb->lkb_nodeid,
+                        lkb->lkb_remid,
+                        lkb->lkb_ownpid,
+                        (unsigned long long)xid,
+                        lkb->lkb_exflags,
+                        lkb->lkb_flags,
+                        lkb->lkb_status,
+                        lkb->lkb_grmode,
+                        lkb->lkb_rqmode,
+                        lkb->lkb_highbast,
+                        rsb_lookup,
+                        lkb->lkb_wait_type,
+                        lkb->lkb_lvbseq,
+                        (unsigned long long)ktime_to_ns(lkb->lkb_timestamp),
+                        (unsigned long long)ktime_to_ns(lkb->lkb_time_bast));
+        return rv;
 }
-static struct rsb_iter *rsb_iter_init(struct dlm_ls *ls)
+static int print_format3(struct dlm_rsb *r, struct seq_file *s)
 {
-        struct rsb_iter *ri;
+        struct dlm_lkb *lkb;
+        int i, lvblen = r->res_ls->ls_lvblen;
+        int print_name = 1;
+        int rv;
-        ri = kzalloc(sizeof *ri, GFP_KERNEL);
+        lock_rsb(r);
-        if (!ri)
-                return NULL;
-        ri->ls = ls;
+        rv = seq_printf(s, "rsb %p %d %x %lx %d %d %u %d ",
-        ri->entry = 0;
+                        r,
-        ri->next = NULL;
+                        r->res_nodeid,
+                        r->res_first_lkid,
+                        r->res_flags,
+                        !list_empty(&r->res_root_list),
+                        !list_empty(&r->res_recover_list),
+                        r->res_recover_locks_count,
+                        r->res_length);
+        if (rv)
+                goto out;
-        if (rsb_iter_next(ri)) {
+        for (i = 0; i < r->res_length; i++) {
-                rsb_iter_free(ri);
+                if (!isascii(r->res_name[i]) || !isprint(r->res_name[i]))
-                return NULL;
+                        print_name = 0;
        }
-        return ri;
+        seq_printf(s, "%s", print_name ? "str " : "hex");
-}
-static void *rsb_seq_start(struct seq_file *file, loff_t *pos)
+        for (i = 0; i < r->res_length; i++) {
-{
+                if (print_name)
-        struct rsb_iter *ri;
+                        seq_printf(s, "%c", r->res_name[i]);
-        loff_t n = *pos;
+                else
+                        seq_printf(s, " %02x", (unsigned char)r->res_name[i]);
+        }
+        rv = seq_printf(s, "\n");
+        if (rv)
+                goto out;
-        ri = rsb_iter_init(file->private);
+        if (!r->res_lvbptr)
-        if (!ri)
+                goto do_locks;
-                return NULL;
-        while (n--) {
+        seq_printf(s, "lvb %u %d", r->res_lvbseq, lvblen);
-                if (rsb_iter_next(ri)) {
-                        rsb_iter_free(ri);
-                        return NULL;
-                }
-        }
-        return ri;
+        for (i = 0; i < lvblen; i++)
-}
+                seq_printf(s, " %02x", (unsigned char)r->res_lvbptr[i]);
+        rv = seq_printf(s, "\n");
+        if (rv)
+                goto out;
-static void *rsb_seq_next(struct seq_file *file, void *iter_ptr, loff_t *pos)
+ do_locks:
-{
+        list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
-        struct rsb_iter *ri = iter_ptr;
+                rv = print_format3_lock(s, lkb, 0);
+                if (rv)
+                        goto out;
+        }
-        (*pos)++;
+        list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
+                rv = print_format3_lock(s, lkb, 0);
+                if (rv)
+                        goto out;
+        }
-        if (rsb_iter_next(ri)) {
+        list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) {
-                rsb_iter_free(ri);
+                rv = print_format3_lock(s, lkb, 0);
-                return NULL;
+                if (rv)
+                        goto out;
        }
-        return ri;
+        list_for_each_entry(lkb, &r->res_lookup, lkb_rsb_lookup) {
+                rv = print_format3_lock(s, lkb, 1);
+                if (rv)
+                        goto out;
+        }
+ out:
+        unlock_rsb(r);
+        return rv;
 }
-static void rsb_seq_stop(struct seq_file *file, void *iter_ptr)
+struct rsbtbl_iter {
-{
+        struct dlm_rsb *rsb;
-        /* nothing for now */
+        unsigned bucket;
-}
+        int format;
+        int header;
+};
-static int rsb_seq_show(struct seq_file *file, void *iter_ptr)
+/* seq_printf returns -1 if the buffer is full, and 0 otherwise.
-{
+   If the buffer is full, seq_printf can be called again, but it
-        struct rsb_iter *ri = iter_ptr;
+   does nothing and just returns -1.  So, these printing routines
+   periodically check the return value to avoid wasting too much time
+   trying to print to a full buffer. */
-        if (ri->locks) {
+static int table_seq_show(struct seq_file *seq, void *iter_ptr)
+{
+        struct rsbtbl_iter *ri = iter_ptr;
+        int rv = 0;
+        switch (ri->format) {
+        case 1:
+                rv = print_format1(ri->rsb, seq);
+                break;
+        case 2:
                if (ri->header) {
-                        seq_printf(file, "id nodeid remid pid xid exflags flags "
+                        seq_printf(seq, "id nodeid remid pid xid exflags "
-                                         "sts grmode rqmode time_ms r_nodeid "
+                                        "flags sts grmode rqmode time_ms "
-                                         "r_len r_name\n");
+                                        "r_nodeid r_len r_name\n");
                        ri->header = 0;
                }
-                print_locks(ri->rsb, file);
+                rv = print_format2(ri->rsb, seq);
-        } else {
+                break;
-                print_resource(ri->rsb, file);
+        case 3:
+                if (ri->header) {
+                        seq_printf(seq, "version rsb 1.1 lvb 1.1 lkb 1.1\n");
+                        ri->header = 0;
+                }
+                rv = print_format3(ri->rsb, seq);
+                break;
        }
-        return 0;
+        return rv;
 }
-static struct seq_operations rsb_seq_ops = {
+static struct seq_operations format1_seq_ops;
-        .start = rsb_seq_start,
+static struct seq_operations format2_seq_ops;
-        .next  = rsb_seq_next,
+static struct seq_operations format3_seq_ops;
-        .stop  = rsb_seq_stop,
-        .show  = rsb_seq_show,
-};
-static int rsb_open(struct inode *inode, struct file *file)
+static void *table_seq_start(struct seq_file *seq, loff_t *pos)
 {
-        struct seq_file *seq;
+        struct dlm_ls *ls = seq->private;
-        int ret;
+        struct rsbtbl_iter *ri;
+        struct dlm_rsb *r;
-        ret = seq_open(file, &rsb_seq_ops);
+        loff_t n = *pos;
-        if (ret)
+        unsigned bucket, entry;
-                return ret;
-        seq = file->private_data;
-        seq->private = inode->i_private;
-        return 0;
-}
-static const struct file_operations rsb_fops = {
-        .owner   = THIS_MODULE,
-        .open    = rsb_open,
-        .read    = seq_read,
-        .llseek  = seq_lseek,
-        .release = seq_release
-};
-/*
+        bucket = n >> 32;
- * Dump state in compact per-lock listing
+        entry = n & ((1LL << 32) - 1);
- */
-static struct rsb_iter *locks_iter_init(struct dlm_ls *ls, loff_t *pos)
+        if (bucket >= ls->ls_rsbtbl_size)
-{
+                return NULL;
-        struct rsb_iter *ri;
-        ri = kzalloc(sizeof *ri, GFP_KERNEL);
+        ri = kzalloc(sizeof(struct rsbtbl_iter), GFP_KERNEL);
        if (!ri)
                return NULL;
+        if (n == 0)
+                ri->header = 1;
+        if (seq->op == &format1_seq_ops)
+                ri->format = 1;
+        if (seq->op == &format2_seq_ops)
+                ri->format = 2;
+        if (seq->op == &format3_seq_ops)
+                ri->format = 3;
+        spin_lock(&ls->ls_rsbtbl[bucket].lock);
+        if (!list_empty(&ls->ls_rsbtbl[bucket].list)) {
+                list_for_each_entry(r, &ls->ls_rsbtbl[bucket].list,
+                                    res_hashchain) {
+                        if (!entry--) {
+                                dlm_hold_rsb(r);
+                                ri->rsb = r;
+                                ri->bucket = bucket;
+                                spin_unlock(&ls->ls_rsbtbl[bucket].lock);
+                                return ri;
+                        }
+                }
+        }
+        spin_unlock(&ls->ls_rsbtbl[bucket].lock);
-        ri->ls = ls;
+        /*
-        ri->entry = 0;
+         * move to the first rsb in the next non-empty bucket
-        ri->next = NULL;
+         */
-        ri->locks = 1;
-        if (*pos == 0)
+        /* zero the entry */
-                ri->header = 1;
+        n &= ~((1LL << 32) - 1);
-        if (rsb_iter_next(ri)) {
+        while (1) {
-                rsb_iter_free(ri);
+                bucket++;
-                return NULL;
+                n += 1LL << 32;
-        }
+                if (bucket >= ls->ls_rsbtbl_size) {
+                        kfree(ri);
+                        return NULL;
+                }
-        return ri;
+                spin_lock(&ls->ls_rsbtbl[bucket].lock);
+                if (!list_empty(&ls->ls_rsbtbl[bucket].list)) {
+                        r = list_first_entry(&ls->ls_rsbtbl[bucket].list,
+                                             struct dlm_rsb, res_hashchain);
+                        dlm_hold_rsb(r);
+                        ri->rsb = r;
+                        ri->bucket = bucket;
+                        spin_unlock(&ls->ls_rsbtbl[bucket].lock);
+                        *pos = n;
+                        return ri;
+                }
+                spin_unlock(&ls->ls_rsbtbl[bucket].lock);
+        }
 }
-static void *locks_seq_start(struct seq_file *file, loff_t *pos)
+static void *table_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos)
 {
-        struct rsb_iter *ri;
+        struct dlm_ls *ls = seq->private;
+        struct rsbtbl_iter *ri = iter_ptr;
+        struct list_head *next;
+        struct dlm_rsb *r, *rp;
        loff_t n = *pos;
+        unsigned bucket;
+        bucket = n >> 32;
+        /*
+         * move to the next rsb in the same bucket
+         */
+        spin_lock(&ls->ls_rsbtbl[bucket].lock);
+        rp = ri->rsb;
+        next = rp->res_hashchain.next;
+        if (next != &ls->ls_rsbtbl[bucket].list) {
+                r = list_entry(next, struct dlm_rsb, res_hashchain);
+                dlm_hold_rsb(r);
+                ri->rsb = r;
+                spin_unlock(&ls->ls_rsbtbl[bucket].lock);
+                dlm_put_rsb(rp);
+                ++*pos;
+                return ri;
+        }
+        spin_unlock(&ls->ls_rsbtbl[bucket].lock);
+        dlm_put_rsb(rp);
-        ri = locks_iter_init(file->private, pos);
+        /*
-        if (!ri)
+         * move to the first rsb in the next non-empty bucket
-                return NULL;
+         */
+        /* zero the entry */
+        n &= ~((1LL << 32) - 1);
-        while (n--) {
+        while (1) {
-                if (rsb_iter_next(ri)) {
+                bucket++;
-                        rsb_iter_free(ri);
+                n += 1LL << 32;
+                if (bucket >= ls->ls_rsbtbl_size) {
+                        kfree(ri);
                        return NULL;
                }
+                spin_lock(&ls->ls_rsbtbl[bucket].lock);
+                if (!list_empty(&ls->ls_rsbtbl[bucket].list)) {
+                        r = list_first_entry(&ls->ls_rsbtbl[bucket].list,
+                                             struct dlm_rsb, res_hashchain);
+                        dlm_hold_rsb(r);
+                        ri->rsb = r;
+                        ri->bucket = bucket;
+                        spin_unlock(&ls->ls_rsbtbl[bucket].lock);
+                        *pos = n;
+                        return ri;
+                }
+                spin_unlock(&ls->ls_rsbtbl[bucket].lock);
        }
+}
+static void table_seq_stop(struct seq_file *seq, void *iter_ptr)
+{
+        struct rsbtbl_iter *ri = iter_ptr;
-        return ri;
+        if (ri) {
+                dlm_put_rsb(ri->rsb);
+                kfree(ri);
+        }
 }
-static struct seq_operations locks_seq_ops = {
+static struct seq_operations format1_seq_ops = {
-        .start = locks_seq_start,
+        .start = table_seq_start,
-        .next  = rsb_seq_next,
+        .next  = table_seq_next,
-        .stop  = rsb_seq_stop,
+        .stop  = table_seq_stop,
-        .show  = rsb_seq_show,
+        .show  = table_seq_show,
+};
+static struct seq_operations format2_seq_ops = {
+        .start = table_seq_start,
+        .next  = table_seq_next,
+        .stop  = table_seq_stop,
+        .show  = table_seq_show,
 };
-static int locks_open(struct inode *inode, struct file *file)
+static struct seq_operations format3_seq_ops = {
+        .start = table_seq_start,
+        .next  = table_seq_next,
+        .stop  = table_seq_stop,
+        .show  = table_seq_show,
+};
+static const struct file_operations format1_fops;
+static const struct file_operations format2_fops;
+static const struct file_operations format3_fops;
+static int table_open(struct inode *inode, struct file *file)
 {
        struct seq_file *seq;
-        int ret;
+        int ret = -1;
+        if (file->f_op == &format1_fops)
+                ret = seq_open(file, &format1_seq_ops);
+        else if (file->f_op == &format2_fops)
+                ret = seq_open(file, &format2_seq_ops);
+        else if (file->f_op == &format3_fops)
+                ret = seq_open(file, &format3_seq_ops);
-        ret = seq_open(file, &locks_seq_ops);
        if (ret)
                return ret;
        seq = file->private_data;
-        seq->private = inode->i_private;
+        seq->private = inode->i_private; /* the dlm_ls */
        return 0;
 }
-static const struct file_operations locks_fops = {
+static const struct file_operations format1_fops = {
        .owner   = THIS_MODULE,
-        .open    = locks_open,
+        .open    = table_open,
+        .read    = seq_read,
+        .llseek  = seq_lseek,
+        .release = seq_release
+};
+static const struct file_operations format2_fops = {
+        .owner   = THIS_MODULE,
+        .open    = table_open,
+        .read    = seq_read,
+        .llseek  = seq_lseek,
+        .release = seq_release
+};
+static const struct file_operations format3_fops = {
+        .owner   = THIS_MODULE,
+        .open    = table_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release
@@ -489,30 +645,33 @@ static const struct file_operations waiters_fops = {
        .read    = waiters_read
 };
+void dlm_delete_debug_file(struct dlm_ls *ls)
+{
+        if (ls->ls_debug_rsb_dentry)
+                debugfs_remove(ls->ls_debug_rsb_dentry);
+        if (ls->ls_debug_waiters_dentry)
+                debugfs_remove(ls->ls_debug_waiters_dentry);
+        if (ls->ls_debug_locks_dentry)
+                debugfs_remove(ls->ls_debug_locks_dentry);
+        if (ls->ls_debug_all_dentry)
+                debugfs_remove(ls->ls_debug_all_dentry);
+}
 int dlm_create_debug_file(struct dlm_ls *ls)
 {
        char name[DLM_LOCKSPACE_LEN+8];
+        /* format 1 */
        ls->ls_debug_rsb_dentry = debugfs_create_file(ls->ls_name,
                                                      S_IFREG | S_IRUGO,
                                                      dlm_root,
                                                      ls,
-                                                      &rsb_fops);
+                                                      &format1_fops);
        if (!ls->ls_debug_rsb_dentry)
-                return -ENOMEM;
+                goto fail;
-        memset(name, 0, sizeof(name));
+        /* format 2 */
-        snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_waiters", ls->ls_name);
-        ls->ls_debug_waiters_dentry = debugfs_create_file(name,
-                                                          S_IFREG | S_IRUGO,
-                                                          dlm_root,
-                                                          ls,
-                                                          &waiters_fops);
-        if (!ls->ls_debug_waiters_dentry) {
-                debugfs_remove(ls->ls_debug_rsb_dentry);
-                return -ENOMEM;
-        }
        memset(name, 0, sizeof(name));
        snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_locks", ls->ls_name);
@@ -521,24 +680,39 @@ int dlm_create_debug_file(struct dlm_ls *ls)
                                                        S_IFREG | S_IRUGO,
                                                        dlm_root,
                                                        ls,
-                                                        &locks_fops);
+                                                        &format2_fops);
-        if (!ls->ls_debug_locks_dentry) {
+        if (!ls->ls_debug_locks_dentry)
-                debugfs_remove(ls->ls_debug_waiters_dentry);
+                goto fail;
-                debugfs_remove(ls->ls_debug_rsb_dentry);
-                return -ENOMEM;
+        /* format 3 */
-        }
+        memset(name, 0, sizeof(name));
+        snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_all", ls->ls_name);
+        ls->ls_debug_all_dentry = debugfs_create_file(name,
+                                                      S_IFREG | S_IRUGO,
+                                                      dlm_root,
+                                                      ls,
+                                                      &format3_fops);
+        if (!ls->ls_debug_all_dentry)
+                goto fail;
+        memset(name, 0, sizeof(name));
+        snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_waiters", ls->ls_name);
+        ls->ls_debug_waiters_dentry = debugfs_create_file(name,
+                                                          S_IFREG | S_IRUGO,
+                                                          dlm_root,
+                                                          ls,
+                                                          &waiters_fops);
+        if (!ls->ls_debug_waiters_dentry)
+                goto fail;
        return 0;
-}
-void dlm_delete_debug_file(struct dlm_ls *ls)
+ fail:
-{
+        dlm_delete_debug_file(ls);
-        if (ls->ls_debug_rsb_dentry)
+        return -ENOMEM;
-                debugfs_remove(ls->ls_debug_rsb_dentry);
-        if (ls->ls_debug_waiters_dentry)
-                debugfs_remove(ls->ls_debug_waiters_dentry);
-        if (ls->ls_debug_locks_dentry)
-                debugfs_remove(ls->ls_debug_locks_dentry);
 }
 int __init dlm_register_debugfs(void)
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
index 85defeb64df4..92969f879a17 100644
--- a/fs/dlm/dir.c
+++ b/fs/dlm/dir.c
@@ -374,7 +374,7 @@ void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
        struct list_head *list;
        struct dlm_rsb *r;
        int offset = 0, dir_nodeid;
-        uint16_t be_namelen;
+        __be16 be_namelen;
        down_read(&ls->ls_root_sem);
@@ -410,15 +410,15 @@ void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
                if (offset + sizeof(uint16_t)*2 + r->res_length > outlen) {
                        /* Write end-of-block record */
-                        be_namelen = 0;
+                        be_namelen = cpu_to_be16(0);
-                        memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
+                        memcpy(outbuf + offset, &be_namelen, sizeof(__be16));
-                        offset += sizeof(uint16_t);
+                        offset += sizeof(__be16);
                        goto out;
                }
                be_namelen = cpu_to_be16(r->res_length);
-                memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
+                memcpy(outbuf + offset, &be_namelen, sizeof(__be16));
-                offset += sizeof(uint16_t);
+                offset += sizeof(__be16);
                memcpy(outbuf + offset, r->res_name, r->res_length);
                offset += r->res_length;
        }
@@ -430,9 +430,9 @@ void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
        if ((list == &ls->ls_root_list) &&
            (offset + sizeof(uint16_t) <= outlen)) {
-                be_namelen = 0xFFFF;
+                be_namelen = cpu_to_be16(0xFFFF);
-                memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
+                memcpy(outbuf + offset, &be_namelen, sizeof(__be16));
-                offset += sizeof(uint16_t);
+                offset += sizeof(__be16);
        }
 out:
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 868e4c9ef127..076e86f38bc8 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -105,7 +105,7 @@ struct dlm_dirtable {
 struct dlm_rsbtable {
        struct list_head        list;
        struct list_head        toss;
-        rwlock_t                lock;
+        spinlock_t              lock;
 };
 struct dlm_lkbtable {
@@ -245,7 +245,8 @@ struct dlm_lkb {
        struct list_head        lkb_astqueue;   /* need ast to be sent */
        struct list_head        lkb_ownqueue;   /* list of locks for a process */
        struct list_head        lkb_time_list;
-        unsigned long           lkb_timestamp;
+        ktime_t                 lkb_time_bast;  /* for debugging */
+        ktime_t                 lkb_timestamp;
        unsigned long           lkb_timeout_cs;
        char                    *lkb_lvbptr;
@@ -481,6 +482,7 @@ struct dlm_ls {
        struct dentry           *ls_debug_rsb_dentry; /* debugfs */
        struct dentry           *ls_debug_waiters_dentry; /* debugfs */
        struct dentry           *ls_debug_locks_dentry; /* debugfs */
+        struct dentry           *ls_debug_all_dentry; /* debugfs */
        wait_queue_head_t       ls_uevent_wait; /* user part of join/leave */
        int                     ls_uevent_result;
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 724ddac91538..01e7d39c5fba 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -307,7 +307,7 @@ static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
        lkb->lkb_lksb->sb_status = rv;
        lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags;
-        dlm_add_ast(lkb, AST_COMP);
+        dlm_add_ast(lkb, AST_COMP, 0);
 }
 static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb)
@@ -318,12 +318,12 @@ static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb)
 static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode)
 {
+        lkb->lkb_time_bast = ktime_get();
        if (is_master_copy(lkb))
                send_bast(r, lkb, rqmode);
-        else {
+        else
-                lkb->lkb_bastmode = rqmode;
+                dlm_add_ast(lkb, AST_BAST, rqmode);
-                dlm_add_ast(lkb, AST_BAST);
-        }
 }
 /*
@@ -412,9 +412,9 @@ static int search_rsb(struct dlm_ls *ls, char *name, int len, int b,
                      unsigned int flags, struct dlm_rsb **r_ret)
 {
        int error;
-        write_lock(&ls->ls_rsbtbl[b].lock);
+        spin_lock(&ls->ls_rsbtbl[b].lock);
        error = _search_rsb(ls, name, len, b, flags, r_ret);
-        write_unlock(&ls->ls_rsbtbl[b].lock);
+        spin_unlock(&ls->ls_rsbtbl[b].lock);
        return error;
 }
@@ -478,16 +478,16 @@ static int find_rsb(struct dlm_ls *ls, char *name, int namelen,
                r->res_nodeid = nodeid;
        }
-        write_lock(&ls->ls_rsbtbl[bucket].lock);
+        spin_lock(&ls->ls_rsbtbl[bucket].lock);
        error = _search_rsb(ls, name, namelen, bucket, 0, &tmp);
        if (!error) {
-                write_unlock(&ls->ls_rsbtbl[bucket].lock);
+                spin_unlock(&ls->ls_rsbtbl[bucket].lock);
                dlm_free_rsb(r);
                r = tmp;
                goto out;
        }
        list_add(&r->res_hashchain, &ls->ls_rsbtbl[bucket].list);
-        write_unlock(&ls->ls_rsbtbl[bucket].lock);
+        spin_unlock(&ls->ls_rsbtbl[bucket].lock);
        error = 0;
 out:
        *r_ret = r;
@@ -530,9 +530,9 @@ static void put_rsb(struct dlm_rsb *r)
        struct dlm_ls *ls = r->res_ls;
        uint32_t bucket = r->res_bucket;
-        write_lock(&ls->ls_rsbtbl[bucket].lock);
+        spin_lock(&ls->ls_rsbtbl[bucket].lock);
        kref_put(&r->res_ref, toss_rsb);
-        write_unlock(&ls->ls_rsbtbl[bucket].lock);
+        spin_unlock(&ls->ls_rsbtbl[bucket].lock);
 }
 void dlm_put_rsb(struct dlm_rsb *r)
@@ -744,6 +744,8 @@ static void add_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int status)
        DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
+        lkb->lkb_timestamp = ktime_get();
        lkb->lkb_status = status;
        switch (status) {
@@ -965,7 +967,7 @@ static int shrink_bucket(struct dlm_ls *ls, int b)
        for (;;) {
                found = 0;
-                write_lock(&ls->ls_rsbtbl[b].lock);
+                spin_lock(&ls->ls_rsbtbl[b].lock);
                list_for_each_entry_reverse(r, &ls->ls_rsbtbl[b].toss,
                                            res_hashchain) {
                        if (!time_after_eq(jiffies, r->res_toss_time +
@@ -976,20 +978,20 @@ static int shrink_bucket(struct dlm_ls *ls, int b)
                }
                if (!found) {
-                        write_unlock(&ls->ls_rsbtbl[b].lock);
+                        spin_unlock(&ls->ls_rsbtbl[b].lock);
                        break;
                }
                if (kref_put(&r->res_ref, kill_rsb)) {
                        list_del(&r->res_hashchain);
-                        write_unlock(&ls->ls_rsbtbl[b].lock);
+                        spin_unlock(&ls->ls_rsbtbl[b].lock);
                        if (is_master(r))
                                dir_remove(r);
                        dlm_free_rsb(r);
                        count++;
                } else {
-                        write_unlock(&ls->ls_rsbtbl[b].lock);
+                        spin_unlock(&ls->ls_rsbtbl[b].lock);
                        log_error(ls, "tossed rsb in use %s", r->res_name);
                }
        }
@@ -1013,10 +1015,8 @@ static void add_timeout(struct dlm_lkb *lkb)
 {
        struct dlm_ls *ls = lkb->lkb_resource->res_ls;
-        if (is_master_copy(lkb)) {
+        if (is_master_copy(lkb))
-                lkb->lkb_timestamp = jiffies;
                return;
-        }
        if (test_bit(LSFL_TIMEWARN, &ls->ls_flags) &&
            !(lkb->lkb_exflags & DLM_LKF_NODLCKWT)) {
@@ -1031,7 +1031,6 @@ static void add_timeout(struct dlm_lkb *lkb)
        DLM_ASSERT(list_empty(&lkb->lkb_time_list), dlm_print_lkb(lkb););
        mutex_lock(&ls->ls_timeout_mutex);
        hold_lkb(lkb);
-        lkb->lkb_timestamp = jiffies;
        list_add_tail(&lkb->lkb_time_list, &ls->ls_timeout);
        mutex_unlock(&ls->ls_timeout_mutex);
 }
@@ -1059,6 +1058,7 @@ void dlm_scan_timeout(struct dlm_ls *ls)
        struct dlm_rsb *r;
        struct dlm_lkb *lkb;
        int do_cancel, do_warn;
+        s64 wait_us;
        for (;;) {
                if (dlm_locking_stopped(ls))
@@ -1069,14 +1069,15 @@ void dlm_scan_timeout(struct dlm_ls *ls)
                mutex_lock(&ls->ls_timeout_mutex);
                list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) {
+                        wait_us = ktime_to_us(ktime_sub(ktime_get(),
+                                                        lkb->lkb_timestamp));
                        if ((lkb->lkb_exflags & DLM_LKF_TIMEOUT) &&
-                            time_after_eq(jiffies, lkb->lkb_timestamp +
+                            wait_us >= (lkb->lkb_timeout_cs * 10000))
-                                          lkb->lkb_timeout_cs * HZ/100))
                                do_cancel = 1;
                        if ((lkb->lkb_flags & DLM_IFL_WATCH_TIMEWARN) &&
-                            time_after_eq(jiffies, lkb->lkb_timestamp +
+                            wait_us >= dlm_config.ci_timewarn_cs * 10000)
-                                           dlm_config.ci_timewarn_cs * HZ/100))
                                do_warn = 1;
                        if (!do_cancel && !do_warn)
@@ -1122,12 +1123,12 @@ void dlm_scan_timeout(struct dlm_ls *ls)
 void dlm_adjust_timeouts(struct dlm_ls *ls)
 {
        struct dlm_lkb *lkb;
-        long adj = jiffies - ls->ls_recover_begin;
+        u64 adj_us = jiffies_to_usecs(jiffies - ls->ls_recover_begin);
        ls->ls_recover_begin = 0;
        mutex_lock(&ls->ls_timeout_mutex);
        list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list)
-                lkb->lkb_timestamp += adj;
+                lkb->lkb_timestamp = ktime_add_us(lkb->lkb_timestamp, adj_us);
        mutex_unlock(&ls->ls_timeout_mutex);
 }
@@ -4223,7 +4224,7 @@ static struct dlm_rsb *find_purged_rsb(struct dlm_ls *ls, int bucket)
 {
        struct dlm_rsb *r, *r_ret = NULL;
-        read_lock(&ls->ls_rsbtbl[bucket].lock);
+        spin_lock(&ls->ls_rsbtbl[bucket].lock);
        list_for_each_entry(r, &ls->ls_rsbtbl[bucket].list, res_hashchain) {
                if (!rsb_flag(r, RSB_LOCKS_PURGED))
                        continue;
@@ -4232,7 +4233,7 @@ static struct dlm_rsb *find_purged_rsb(struct dlm_ls *ls, int bucket)
                r_ret = r;
                break;
        }
-        read_unlock(&ls->ls_rsbtbl[bucket].lock);
+        spin_unlock(&ls->ls_rsbtbl[bucket].lock);
        return r_ret;
 }
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 8d86b7960f0d..aa32e5f02493 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -464,7 +464,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
        for (i = 0; i < size; i++) {
                INIT_LIST_HEAD(&ls->ls_rsbtbl[i].list);
                INIT_LIST_HEAD(&ls->ls_rsbtbl[i].toss);
-                rwlock_init(&ls->ls_rsbtbl[i].lock);
+                spin_lock_init(&ls->ls_rsbtbl[i].lock);
        }
        size = dlm_config.ci_lkbtbl_size;
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 3962262f991a..103a5ebd1371 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -295,6 +295,7 @@ static int add_sock(struct socket *sock, struct connection *con)
        con->sock->sk->sk_write_space = lowcomms_write_space;
        con->sock->sk->sk_state_change = lowcomms_state_change;
        con->sock->sk->sk_user_data = con;
+        con->sock->sk->sk_allocation = GFP_NOFS;
        return 0;
 }
@@ -823,7 +824,6 @@ static void sctp_init_assoc(struct connection *con)
        len = e->len;
        offset = e->offset;
        spin_unlock(&con->writequeue_lock);
-        kmap(e->page);
        /* Send the first block off the write queue */
        iov[0].iov_base = page_address(e->page)+offset;
@@ -854,7 +854,6 @@ static void sctp_init_assoc(struct connection *con)
                if (e->len == 0 && e->users == 0) {
                        list_del(&e->list);
-                        kunmap(e->page);
                        free_entry(e);
                }
                spin_unlock(&con->writequeue_lock);
@@ -1203,8 +1202,6 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
        if (e) {
        got_one:
-                if (users == 0)
-                        kmap(e->page);
                *ppc = page_address(e->page) + offset;
                return e;
        }
@@ -1233,7 +1230,6 @@ void dlm_lowcomms_commit_buffer(void *mh)
        if (users)
                goto out;
        e->len = e->end - e->offset;
-        kunmap(e->page);
        spin_unlock(&con->writequeue_lock);
        if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) {
@@ -1272,7 +1268,6 @@ static void send_to_sock(struct connection *con)
                offset = e->offset;
                BUG_ON(len == 0 && e->users == 0);
                spin_unlock(&con->writequeue_lock);
-                kmap(e->page);
                ret = 0;
                if (len) {
@@ -1294,7 +1289,6 @@ static void send_to_sock(struct connection *con)
                if (e->len == 0 && e->users == 0) {
                        list_del(&e->list);
-                        kunmap(e->page);
                        free_entry(e);
                        continue;
                }
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
index 54c14c6d06cb..c1775b84ebab 100644
--- a/fs/dlm/memory.c
+++ b/fs/dlm/memory.c
@@ -39,7 +39,7 @@ char *dlm_allocate_lvb(struct dlm_ls *ls)
 {
        char *p;
-        p = kzalloc(ls->ls_lvblen, GFP_KERNEL);
+        p = kzalloc(ls->ls_lvblen, ls->ls_allocation);
        return p;
 }
@@ -57,7 +57,7 @@ struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen)
        DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
-        r = kzalloc(sizeof(*r) + namelen, GFP_KERNEL);
+        r = kzalloc(sizeof(*r) + namelen, ls->ls_allocation);
        return r;
 }
@@ -72,7 +72,7 @@ struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls)
 {
        struct dlm_lkb *lkb;
-        lkb = kmem_cache_zalloc(lkb_cache, GFP_KERNEL);
+        lkb = kmem_cache_zalloc(lkb_cache, ls->ls_allocation);
        return lkb;
 }
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
index 07ac709f3ed7..f3396c622aec 100644
--- a/fs/dlm/midcomms.c
+++ b/fs/dlm/midcomms.c
@@ -112,7 +112,7 @@ int dlm_process_incoming_buffer(int nodeid, const void *base,
                   ordinary messages). */
                if (msglen > sizeof(__tmp) && p == &__tmp.p) {
-                        p = kmalloc(dlm_config.ci_buffer_size, GFP_KERNEL);
+                        p = kmalloc(dlm_config.ci_buffer_size, GFP_NOFS);
                        if (p == NULL)
                                return ret;
                }
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index aa2a5775a027..ccc9d62c462d 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -115,7 +115,6 @@ static void fill_data(struct dlm_lock_data *data, struct dlm_lkb *lkb)
        data->status = lkb->lkb_status;
        data->grmode = lkb->lkb_grmode;
        data->rqmode = lkb->lkb_rqmode;
-        data->timestamp = lkb->lkb_timestamp;
        if (lkb->lkb_ua)
                data->xid = lkb->lkb_ua->xid;
        if (r) {
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index eba87ff3177b..894a32d438d5 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -168,7 +168,7 @@ static int dlm_plock_callback(struct plock_op *op)
        notify = xop->callback;
        if (op->info.rv) {
-                notify(flc, NULL, op->info.rv);
+                notify(fl, NULL, op->info.rv);
                goto out;
        }
@@ -187,7 +187,7 @@ static int dlm_plock_callback(struct plock_op *op)
                          (unsigned long long)op->info.number, file, fl);
        }
-        rv = notify(flc, NULL, 0);
+        rv = notify(fl, NULL, 0);
        if (rv) {
                /* XXX: We need to cancel the fs lock here: */
                log_print("dlm_plock_callback: lock granted after lock request "
@@ -304,7 +304,9 @@ int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file,
        if (rv == -ENOENT)
                rv = 0;
        else if (rv > 0) {
+                locks_init_lock(fl);
                fl->fl_type = (op->info.ex) ? F_WRLCK : F_RDLCK;
+                fl->fl_flags = FL_POSIX;
                fl->fl_pid = op->info.pid;
                fl->fl_start = op->info.start;
                fl->fl_end = op->info.end;
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
index 80aba5bdd4a4..eda43f362616 100644
--- a/fs/dlm/recover.c
+++ b/fs/dlm/recover.c
@@ -726,7 +726,7 @@ int dlm_create_root_list(struct dlm_ls *ls)
        }
        for (i = 0; i < ls->ls_rsbtbl_size; i++) {
-                read_lock(&ls->ls_rsbtbl[i].lock);
+                spin_lock(&ls->ls_rsbtbl[i].lock);
                list_for_each_entry(r, &ls->ls_rsbtbl[i].list, res_hashchain) {
                        list_add(&r->res_root_list, &ls->ls_root_list);
                        dlm_hold_rsb(r);
@@ -737,7 +737,7 @@ int dlm_create_root_list(struct dlm_ls *ls)
                   but no other recovery steps should do anything with them. */
                if (dlm_no_directory(ls)) {
-                        read_unlock(&ls->ls_rsbtbl[i].lock);
+                        spin_unlock(&ls->ls_rsbtbl[i].lock);
                        continue;
                }
@@ -745,7 +745,7 @@ int dlm_create_root_list(struct dlm_ls *ls)
                        list_add(&r->res_root_list, &ls->ls_root_list);
                        dlm_hold_rsb(r);
                }
-                read_unlock(&ls->ls_rsbtbl[i].lock);
+                spin_unlock(&ls->ls_rsbtbl[i].lock);
        }
 out:
        up_write(&ls->ls_root_sem);
@@ -775,7 +775,7 @@ void dlm_clear_toss_list(struct dlm_ls *ls)
        int i;
        for (i = 0; i < ls->ls_rsbtbl_size; i++) {
-                write_lock(&ls->ls_rsbtbl[i].lock);
+                spin_lock(&ls->ls_rsbtbl[i].lock);
                list_for_each_entry_safe(r, safe, &ls->ls_rsbtbl[i].toss,
                                         res_hashchain) {
                        if (dlm_no_directory(ls) || !is_master(r)) {
@@ -783,7 +783,7 @@ void dlm_clear_toss_list(struct dlm_ls *ls)
                                dlm_free_rsb(r);
                        }
                }
-                write_unlock(&ls->ls_rsbtbl[i].lock);
+                spin_unlock(&ls->ls_rsbtbl[i].lock);
        }
 }
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index b3832c67194a..065149e84f42 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -175,7 +175,7 @@ static int lkb_is_endoflife(struct dlm_lkb *lkb, int sb_status, int type)
 /* we could possibly check if the cancel of an orphan has resulted in the lkb
   being removed and then remove that lkb from the orphans list and free it */
-void dlm_user_add_ast(struct dlm_lkb *lkb, int type)
+void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode)
 {
        struct dlm_ls *ls;
        struct dlm_user_args *ua;
@@ -208,6 +208,8 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type)
        ast_type = lkb->lkb_ast_type;
        lkb->lkb_ast_type |= type;
+        if (bastmode)
+                lkb->lkb_bastmode = bastmode;
        if (!ast_type) {
                kref_get(&lkb->lkb_ref);
diff --git a/fs/dlm/user.h b/fs/dlm/user.h
index 35eb6a13d616..1c9686492286 100644
--- a/fs/dlm/user.h
+++ b/fs/dlm/user.h
@@ -9,7 +9,7 @@
 #ifndef __USER_DOT_H__
 #define __USER_DOT_H__
-void dlm_user_add_ast(struct dlm_lkb *lkb, int type);
+void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode);
 int dlm_user_init(void);
 void dlm_user_exit(void);
 int dlm_device_deregister(struct dlm_ls *ls);
diff --git a/fs/dquot.c b/fs/dquot.c
index c237ccc8581c..bca3cac4bee7 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -87,14 +87,17 @@
 #define __DQUOT_PARANOIA
 /*
- * There are two quota SMP locks. dq_list_lock protects all lists with quotas
+ * There are three quota SMP locks. dq_list_lock protects all lists with quotas
- * and quota formats and also dqstats structure containing statistics about the
+ * and quota formats, dqstats structure containing statistics about the lists.
- * lists. dq_data_lock protects data from dq_dqb and also mem_dqinfo structures
+ * dq_data_lock protects data from dq_dqb and also mem_dqinfo structures and
- * and also guards consistency of dquot->dq_dqb with inode->i_blocks, i_bytes.
+ * also guards consistency of dquot->dq_dqb with inode->i_blocks, i_bytes.
 * i_blocks and i_bytes updates itself are guarded by i_lock acquired directly
- * in inode_add_bytes() and inode_sub_bytes().
+ * in inode_add_bytes() and inode_sub_bytes(). dq_state_lock protects
+ * modifications of quota state (on quotaon and quotaoff) and readers who care
+ * about latest values take it as well.
 *
- * The spinlock ordering is hence: dq_data_lock > dq_list_lock > i_lock
+ * The spinlock ordering is hence: dq_data_lock > dq_list_lock > i_lock,
+ *   dq_list_lock > dq_state_lock
 *
 * Note that some things (eg. sb pointer, type, id) doesn't change during
 * the life of the dquot structure and so needn't to be protected by a lock
@@ -103,12 +106,7 @@
 * operation is just reading pointers from inode (or not using them at all) the
 * read lock is enough. If pointers are altered function must hold write lock
 * (these locking rules also apply for S_NOQUOTA flag in the inode - note that
- * for altering the flag i_mutex is also needed).  If operation is holding
+ * for altering the flag i_mutex is also needed).
- * reference to dquot in other way (e.g. quotactl ops) it must be guarded by
- * dqonoff_mutex.
- * This locking assures that:
- *   a) update/access to dquot pointers in inode is serialized
- *   b) everyone is guarded against invalidate_dquots()
 *
 * Each dquot has its dq_lock mutex. Locked dquots might not be referenced
 * from inodes (dquot_alloc_space() and such don't check the dq_lock).
@@ -122,10 +120,17 @@
 * Lock ordering (including related VFS locks) is the following:
 *   i_mutex > dqonoff_sem > journal_lock > dqptr_sem > dquot->dq_lock >
 *   dqio_mutex
+ * The lock ordering of dqptr_sem imposed by quota code is only dqonoff_sem >
+ * dqptr_sem. But a filesystem has to account for the fact that functions such
+ * as dquot_alloc_space() acquire dqptr_sem and they usually have to be called
+ * from inside a transaction to keep the filesystem consistent after a crash.
+ * Also filesystems usually want to do some IO on dquot from ->mark_dirty which
+ * is called with dqptr_sem held.
 * i_mutex on quota files is special (it's below dqio_mutex)
 */
 static DEFINE_SPINLOCK(dq_list_lock);
+static DEFINE_SPINLOCK(dq_state_lock);
 DEFINE_SPINLOCK(dq_data_lock);
 static char *quotatypes[] = INITQFNAMES;
@@ -211,8 +216,6 @@ static struct hlist_head *dquot_hash;
 struct dqstats dqstats;
-static void dqput(struct dquot *dquot);
 static inline unsigned int
 hashfn(const struct super_block *sb, unsigned int id, int type)
 {
@@ -415,11 +418,22 @@ out_dqlock:
        return ret;
 }
+/*
+ * Release the memory of a dquot by returning it to dquot_cachep.
+ * NOTE(review): exported, presumably so that filesystems can use it as their
+ * ->destroy_dquot operation - confirm against the dquot_operations users.
+ */
+void dquot_destroy(struct dquot *dquot)
+{
+        kmem_cache_free(dquot_cachep, dquot);
+}
+EXPORT_SYMBOL(dquot_destroy);
+/* Free a dquot through the ->destroy_dquot operation of its superblock */
+static inline void do_destroy_dquot(struct dquot *dquot)
+{
+        dquot->dq_sb->dq_op->destroy_dquot(dquot);
+}
 /* Invalidate all dquots on the list. Note that this function is called after
 * quota is disabled and pointers from inodes removed so there cannot be new
 * quota users. There can still be some users of quotas due to inodes being
 * just deleted or pruned by prune_icache() (those are not attached to any
- * list). We have to wait for such users.
+ * list) or parallel quotactl call. We have to wait for such users.
 */
 static void invalidate_dquots(struct super_block *sb, int type)
 {
@@ -463,11 +477,46 @@ restart:
                remove_dquot_hash(dquot);
                remove_free_dquot(dquot);
                remove_inuse(dquot);
-                kmem_cache_free(dquot_cachep, dquot);
+                do_destroy_dquot(dquot);
        }
        spin_unlock(&dq_list_lock);
 }
+/*
+ * Call callback for every active dquot on given filesystem.
+ *
+ * Walks inuse_list with dqonoff_mutex held for the whole scan. Entries that
+ * do not have DQ_ACTIVE_B set or that belong to a different superblock are
+ * skipped. For each remaining dquot a reference is taken (dq_count) and
+ * dq_list_lock is dropped before fn(dquot, priv) is called, so fn runs
+ * without the list spinlock held.
+ *
+ * The scan stops as soon as fn returns a negative value, and that value is
+ * returned to the caller; otherwise the result of the last fn call is
+ * returned (0 if fn was never called).
+ */
+int dquot_scan_active(struct super_block *sb,
+                      int (*fn)(struct dquot *dquot, unsigned long priv),
+                      unsigned long priv)
+{
+        struct dquot *dquot, *old_dquot = NULL;
+        int ret = 0;
+        mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
+        spin_lock(&dq_list_lock);
+        list_for_each_entry(dquot, &inuse_list, dq_inuse) {
+                if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags))
+                        continue;
+                if (dquot->dq_sb != sb)
+                        continue;
+                /* Now we have active dquot so we can just increase use count.
+                 * The reference to the previously visited dquot is put only
+                 * after dq_list_lock has been released below. */
+                atomic_inc(&dquot->dq_count);
+                dqstats.lookups++;
+                spin_unlock(&dq_list_lock);
+                dqput(old_dquot);
+                old_dquot = dquot;
+                ret = fn(dquot, priv);
+                if (ret < 0)
+                        goto out;
+                spin_lock(&dq_list_lock);
+                /* We are safe to continue now because our dquot could not
+                 * be moved out of the inuse list while we hold the reference */
+        }
+        spin_unlock(&dq_list_lock);
+out:
+        dqput(old_dquot);
+        mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
+        return ret;
+}
 int vfs_quota_sync(struct super_block *sb, int type)
 {
        struct list_head *dirty;
@@ -479,7 +528,7 @@ int vfs_quota_sync(struct super_block *sb, int type)
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                if (type != -1 && cnt != type)
                        continue;
-                if (!sb_has_quota_enabled(sb, cnt))
+                if (!sb_has_quota_active(sb, cnt))
                        continue;
                spin_lock(&dq_list_lock);
                dirty = &dqopt->info[cnt].dqi_dirty_list;
@@ -504,8 +553,8 @@ int vfs_quota_sync(struct super_block *sb, int type)
        }
        for (cnt = 0; cnt < MAXQUOTAS; cnt++)
-                if ((cnt == type || type == -1) && sb_has_quota_enabled(sb, cnt)
+                if ((cnt == type || type == -1) && sb_has_quota_active(sb, cnt)
-                        && info_dirty(&dqopt->info[cnt]))
+                    && info_dirty(&dqopt->info[cnt]))
                        sb->dq_op->write_info(sb, cnt);
        spin_lock(&dq_list_lock);
        dqstats.syncs++;
@@ -527,7 +576,7 @@ static void prune_dqcache(int count)
                remove_dquot_hash(dquot);
                remove_free_dquot(dquot);
                remove_inuse(dquot);
-                kmem_cache_free(dquot_cachep, dquot);
+                do_destroy_dquot(dquot);
                count--;
                head = free_dquots.prev;
        }
@@ -556,9 +605,8 @@ static struct shrinker dqcache_shrinker = {
 /*
 * Put reference to dquot
 * NOTE: If you change this function please check whether dqput_blocks() works right...
- * MUST be called with either dqptr_sem or dqonoff_mutex held
 */
-static void dqput(struct dquot *dquot)
+void dqput(struct dquot *dquot)
 {
        int ret;
@@ -584,7 +632,7 @@ we_slept:
                /* We have more than one user... nothing to do */
                atomic_dec(&dquot->dq_count);
                /* Releasing dquot during quotaoff phase? */
-                if (!sb_has_quota_enabled(dquot->dq_sb, dquot->dq_type) &&
+                if (!sb_has_quota_active(dquot->dq_sb, dquot->dq_type) &&
                    atomic_read(&dquot->dq_count) == 1)
                        wake_up(&dquot->dq_wait_unused);
                spin_unlock(&dq_list_lock);
@@ -625,11 +673,17 @@ we_slept:
        spin_unlock(&dq_list_lock);
 }
+/*
+ * Allocate a zeroed dquot from dquot_cachep using GFP_NOFS.
+ * The sb and type arguments are not used by this implementation.
+ */
+struct dquot *dquot_alloc(struct super_block *sb, int type)
+{
+        return kmem_cache_zalloc(dquot_cachep, GFP_NOFS);
+}
+EXPORT_SYMBOL(dquot_alloc);
 static struct dquot *get_empty_dquot(struct super_block *sb, int type)
 {
        struct dquot *dquot;
-        dquot = kmem_cache_zalloc(dquot_cachep, GFP_NOFS);
+        dquot = sb->dq_op->alloc_dquot(sb, type);
        if(!dquot)
                return NODQUOT;
@@ -648,17 +702,29 @@ static struct dquot *get_empty_dquot(struct super_block *sb, int type)
 /*
 * Get reference to dquot
- * MUST be called with either dqptr_sem or dqonoff_mutex held
+ *
+ * Locking is slightly tricky here. We are guarded from parallel quotaoff()
+ * destroying our dquot by:
+ *   a) checking for quota flags under dq_list_lock and
+ *   b) getting a reference to dquot before we release dq_list_lock
 */
-static struct dquot *dqget(struct super_block *sb, unsigned int id, int type)
+struct dquot *dqget(struct super_block *sb, unsigned int id, int type)
 {
        unsigned int hashent = hashfn(sb, id, type);
-        struct dquot *dquot, *empty = NODQUOT;
+        struct dquot *dquot = NODQUOT, *empty = NODQUOT;
-        if (!sb_has_quota_enabled(sb, type))
+        if (!sb_has_quota_active(sb, type))
                return NODQUOT;
 we_slept:
        spin_lock(&dq_list_lock);
+        spin_lock(&dq_state_lock);
+        if (!sb_has_quota_active(sb, type)) {
+                spin_unlock(&dq_state_lock);
+                spin_unlock(&dq_list_lock);
+                goto out;
+        }
+        spin_unlock(&dq_state_lock);
        if ((dquot = find_dquot(hashent, sb, id, type)) == NODQUOT) {
                if (empty == NODQUOT) {
                        spin_unlock(&dq_list_lock);
@@ -667,6 +733,7 @@ we_slept:
                        goto we_slept;
                }
                dquot = empty;
+                empty = NODQUOT;
                dquot->dq_id = id;
                /* all dquots go on the inuse_list */
                put_inuse(dquot);
@@ -681,8 +748,6 @@ we_slept:
                dqstats.cache_hits++;
                dqstats.lookups++;
                spin_unlock(&dq_list_lock);
-                if (empty)
-                        kmem_cache_free(dquot_cachep, empty);
        }
        /* Wait for dq_lock - after this we know that either dquot_release() is already
         * finished or it will be canceled due to dq_count > 1 test */
@@ -690,11 +755,15 @@ we_slept:
        /* Read the dquot and instantiate it (everything done only if needed) */
        if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && sb->dq_op->acquire_dquot(dquot) < 0) {
                dqput(dquot);
-                return NODQUOT;
+                dquot = NODQUOT;
+                goto out;
        }
 #ifdef __DQUOT_PARANOIA
        BUG_ON(!dquot->dq_sb);  /* Has somebody invalidated entry under us? */
 #endif
+out:
+        if (empty)
+                do_destroy_dquot(empty);
        return dquot;
 }
@@ -820,7 +889,7 @@ static void drop_dquot_ref(struct super_block *sb, int type)
        }
 }
-static inline void dquot_incr_inodes(struct dquot *dquot, unsigned long number)
+static inline void dquot_incr_inodes(struct dquot *dquot, qsize_t number)
 {
        dquot->dq_dqb.dqb_curinodes += number;
 }
@@ -830,9 +899,10 @@ static inline void dquot_incr_space(struct dquot *dquot, qsize_t number)
        dquot->dq_dqb.dqb_curspace += number;
 }
-static inline void dquot_decr_inodes(struct dquot *dquot, unsigned long number)
+static inline void dquot_decr_inodes(struct dquot *dquot, qsize_t number)
 {
-        if (dquot->dq_dqb.dqb_curinodes > number)
+        if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE ||
+            dquot->dq_dqb.dqb_curinodes >= number)
                dquot->dq_dqb.dqb_curinodes -= number;
        else
                dquot->dq_dqb.dqb_curinodes = 0;
@@ -843,11 +913,12 @@ static inline void dquot_decr_inodes(struct dquot *dquot, unsigned long number)
 static inline void dquot_decr_space(struct dquot *dquot, qsize_t number)
 {
-        if (dquot->dq_dqb.dqb_curspace > number)
+        if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE ||
+            dquot->dq_dqb.dqb_curspace >= number)
                dquot->dq_dqb.dqb_curspace -= number;
        else
                dquot->dq_dqb.dqb_curspace = 0;
-        if (toqb(dquot->dq_dqb.dqb_curspace) <= dquot->dq_dqb.dqb_bsoftlimit)
+        if (dquot->dq_dqb.dqb_curspace <= dquot->dq_dqb.dqb_bsoftlimit)
                dquot->dq_dqb.dqb_btime = (time_t) 0;
        clear_bit(DQ_BLKS_B, &dquot->dq_flags);
 }
@@ -1023,10 +1094,11 @@ static inline char ignore_hardlimit(struct dquot *dquot)
 }
 /* needs dq_data_lock */
-static int check_idq(struct dquot *dquot, ulong inodes, char *warntype)
+static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
 {
        *warntype = QUOTA_NL_NOWARN;
-        if (inodes <= 0 || test_bit(DQ_FAKE_B, &dquot->dq_flags))
+        if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) ||
+            test_bit(DQ_FAKE_B, &dquot->dq_flags))
                return QUOTA_OK;
        if (dquot->dq_dqb.dqb_ihardlimit &&
@@ -1058,11 +1130,12 @@ static int check_idq(struct dquot *dquot, ulong inodes, char *warntype)
 static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *warntype)
 {
        *warntype = QUOTA_NL_NOWARN;
-        if (space <= 0 || test_bit(DQ_FAKE_B, &dquot->dq_flags))
+        if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) ||
+            test_bit(DQ_FAKE_B, &dquot->dq_flags))
                return QUOTA_OK;
        if (dquot->dq_dqb.dqb_bhardlimit &&
-           toqb(dquot->dq_dqb.dqb_curspace + space) > dquot->dq_dqb.dqb_bhardlimit &&
+            dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bhardlimit &&
            !ignore_hardlimit(dquot)) {
                if (!prealloc)
                        *warntype = QUOTA_NL_BHARDWARN;
@@ -1070,7 +1143,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
        }
        if (dquot->dq_dqb.dqb_bsoftlimit &&
-           toqb(dquot->dq_dqb.dqb_curspace + space) > dquot->dq_dqb.dqb_bsoftlimit &&
+            dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bsoftlimit &&
            dquot->dq_dqb.dqb_btime && get_seconds() >= dquot->dq_dqb.dqb_btime &&
            !ignore_hardlimit(dquot)) {
                if (!prealloc)
@@ -1079,7 +1152,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
        }
        if (dquot->dq_dqb.dqb_bsoftlimit &&
-           toqb(dquot->dq_dqb.dqb_curspace + space) > dquot->dq_dqb.dqb_bsoftlimit &&
+            dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bsoftlimit &&
            dquot->dq_dqb.dqb_btime == 0) {
                if (!prealloc) {
                        *warntype = QUOTA_NL_BSOFTWARN;
@@ -1096,10 +1169,11 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
        return QUOTA_OK;
 }
-static int info_idq_free(struct dquot *dquot, ulong inodes)
+static int info_idq_free(struct dquot *dquot, qsize_t inodes)
 {
        if (test_bit(DQ_FAKE_B, &dquot->dq_flags) ||
-            dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit)
+            dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit ||
+            !sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type))
                return QUOTA_NL_NOWARN;
        if (dquot->dq_dqb.dqb_curinodes - inodes <= dquot->dq_dqb.dqb_isoftlimit)
@@ -1113,71 +1187,88 @@ static int info_idq_free(struct dquot *dquot, ulong inodes)
 static int info_bdq_free(struct dquot *dquot, qsize_t space)
 {
        if (test_bit(DQ_FAKE_B, &dquot->dq_flags) ||
-            toqb(dquot->dq_dqb.dqb_curspace) <= dquot->dq_dqb.dqb_bsoftlimit)
+            dquot->dq_dqb.dqb_curspace <= dquot->dq_dqb.dqb_bsoftlimit)
                return QUOTA_NL_NOWARN;
-        if (toqb(dquot->dq_dqb.dqb_curspace - space) <=
+        if (dquot->dq_dqb.dqb_curspace - space <= dquot->dq_dqb.dqb_bsoftlimit)
-            dquot->dq_dqb.dqb_bsoftlimit)
                return QUOTA_NL_BSOFTBELOW;
-        if (toqb(dquot->dq_dqb.dqb_curspace) >= dquot->dq_dqb.dqb_bhardlimit &&
+        if (dquot->dq_dqb.dqb_curspace >= dquot->dq_dqb.dqb_bhardlimit &&
-            toqb(dquot->dq_dqb.dqb_curspace - space) <
+            dquot->dq_dqb.dqb_curspace - space < dquot->dq_dqb.dqb_bhardlimit)
-                                                dquot->dq_dqb.dqb_bhardlimit)
                return QUOTA_NL_BHARDBELOW;
        return QUOTA_NL_NOWARN;
 }
 /*
 *      Initialize quota pointers in inode
- *      Transaction must be started at entry
+ *      We do things in a slightly complicated way, but this lets us avoid
+ *      calling dqget(), and thus filesystem callbacks, under dqptr_sem.
 */
 int dquot_initialize(struct inode *inode, int type)
 {
        unsigned int id = 0;
        int cnt, ret = 0;
+        struct dquot *got[MAXQUOTAS] = { NODQUOT, NODQUOT };
+        struct super_block *sb = inode->i_sb;
        /* First test before acquiring mutex - solves deadlocks when we
         * re-enter the quota code and are already holding the mutex */
        if (IS_NOQUOTA(inode))
                return 0;
-        down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
+        /* First get references to structures we might need. */
+        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+                if (type != -1 && cnt != type)
+                        continue;
+                switch (cnt) {
+                case USRQUOTA:
+                        id = inode->i_uid;
+                        break;
+                case GRPQUOTA:
+                        id = inode->i_gid;
+                        break;
+                }
+                got[cnt] = dqget(sb, id, cnt);
+        }
+        down_write(&sb_dqopt(sb)->dqptr_sem);
        /* Having dqptr_sem we know NOQUOTA flags can't be altered... */
        if (IS_NOQUOTA(inode))
                goto out_err;
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                if (type != -1 && cnt != type)
                        continue;
+                /* Avoid races with quotaoff() */
+                if (!sb_has_quota_active(sb, cnt))
+                        continue;
                if (inode->i_dquot[cnt] == NODQUOT) {
-                        switch (cnt) {
+                        inode->i_dquot[cnt] = got[cnt];
-                                case USRQUOTA:
+                        got[cnt] = NODQUOT;
-                                        id = inode->i_uid;
-                                        break;
-                                case GRPQUOTA:
-                                        id = inode->i_gid;
-                                        break;
-                        }
-                        inode->i_dquot[cnt] = dqget(inode->i_sb, id, cnt);
                }
        }
 out_err:
-        up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
+        up_write(&sb_dqopt(sb)->dqptr_sem);
+        /* Drop unused references */
+        for (cnt = 0; cnt < MAXQUOTAS; cnt++)
+                dqput(got[cnt]);
        return ret;
 }
 /*
 *      Release all quotas referenced by inode
- *      Transaction must be started at an entry
 */
 int dquot_drop(struct inode *inode)
 {
        int cnt;
+        struct dquot *put[MAXQUOTAS];
        down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-                if (inode->i_dquot[cnt] != NODQUOT) {
+                put[cnt] = inode->i_dquot[cnt];
-                        dqput(inode->i_dquot[cnt]);
+                inode->i_dquot[cnt] = NODQUOT;
-                        inode->i_dquot[cnt] = NODQUOT;
-                }
        }
        up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
+        for (cnt = 0; cnt < MAXQUOTAS; cnt++)
+                dqput(put[cnt]);
        return 0;
 }
@@ -1264,7 +1355,7 @@ warn_put_all:
 /*
 * This operation can block, but only after everything is updated
 */
-int dquot_alloc_inode(const struct inode *inode, unsigned long number)
+int dquot_alloc_inode(const struct inode *inode, qsize_t number)
 {
        int cnt, ret = NO_QUOTA;
        char warntype[MAXQUOTAS];
@@ -1349,7 +1440,7 @@ out_sub:
 /*
 * This operation can block, but only after everything is updated
 */
-int dquot_free_inode(const struct inode *inode, unsigned long number)
+int dquot_free_inode(const struct inode *inode, qsize_t number)
 {
        unsigned int cnt;
        char warntype[MAXQUOTAS];
@@ -1393,8 +1484,9 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
        qsize_t space;
        struct dquot *transfer_from[MAXQUOTAS];
        struct dquot *transfer_to[MAXQUOTAS];
-        int cnt, ret = NO_QUOTA, chuid = (iattr->ia_valid & ATTR_UID) && inode->i_uid != iattr->ia_uid,
+        int cnt, ret = QUOTA_OK;
-            chgid = (iattr->ia_valid & ATTR_GID) && inode->i_gid != iattr->ia_gid;
+        int chuid = iattr->ia_valid & ATTR_UID && inode->i_uid != iattr->ia_uid,
+            chgid = iattr->ia_valid & ATTR_GID && inode->i_gid != iattr->ia_gid;
        char warntype_to[MAXQUOTAS];
        char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS];
@@ -1402,21 +1494,11 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
         * re-enter the quota code and are already holding the mutex */
        if (IS_NOQUOTA(inode))
                return QUOTA_OK;
-        /* Clear the arrays */
+        /* Initialize the arrays */
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-                transfer_to[cnt] = transfer_from[cnt] = NODQUOT;
+                transfer_from[cnt] = NODQUOT;
+                transfer_to[cnt] = NODQUOT;
                warntype_to[cnt] = QUOTA_NL_NOWARN;
-        }
-        down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
-        /* Now recheck reliably when holding dqptr_sem */
-        if (IS_NOQUOTA(inode)) {        /* File without quota accounting? */
-                up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
-                return QUOTA_OK;
-        }
-        /* First build the transfer_to list - here we can block on
-         * reading/instantiating of dquots.  We know that the transaction for
-         * us was already started so we don't violate lock ranking here */
-        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                switch (cnt) {
                        case USRQUOTA:
                                if (!chuid)
@@ -1430,6 +1512,13 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
                                break;
                }
        }
+        down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
+        /* Now recheck reliably when holding dqptr_sem */
+        if (IS_NOQUOTA(inode)) {        /* File without quota accounting? */
+                up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
+                goto put_all;
+        }
        spin_lock(&dq_data_lock);
        space = inode_get_bytes(inode);
        /* Build the transfer_from list and check the limits */
@@ -1440,7 +1529,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
                if (check_idq(transfer_to[cnt], 1, warntype_to + cnt) ==
                    NO_QUOTA || check_bdq(transfer_to[cnt], space, 0,
                    warntype_to + cnt) == NO_QUOTA)
-                        goto warn_put_all;
+                        goto over_quota;
        }
        /*
@@ -1468,34 +1557,43 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
                inode->i_dquot[cnt] = transfer_to[cnt];
        }
-        ret = QUOTA_OK;
-warn_put_all:
        spin_unlock(&dq_data_lock);
+        up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
        /* Dirtify all the dquots - this can block when journalling */
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                if (transfer_from[cnt])
                        mark_dquot_dirty(transfer_from[cnt]);
-                if (transfer_to[cnt])
+                if (transfer_to[cnt]) {
                        mark_dquot_dirty(transfer_to[cnt]);
+                        /* The reference we got is transferred to the inode */
+                        transfer_to[cnt] = NODQUOT;
+                }
        }
+warn_put_all:
        flush_warnings(transfer_to, warntype_to);
        flush_warnings(transfer_from, warntype_from_inodes);
        flush_warnings(transfer_from, warntype_from_space);
-        
+put_all:
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-                if (ret == QUOTA_OK && transfer_from[cnt] != NODQUOT)
+                dqput(transfer_from[cnt]);
-                        dqput(transfer_from[cnt]);
+                dqput(transfer_to[cnt]);
-                if (ret == NO_QUOTA && transfer_to[cnt] != NODQUOT)
-                        dqput(transfer_to[cnt]);
        }
-        up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
        return ret;
+over_quota:
+        spin_unlock(&dq_data_lock);
+        up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
+        /* Clear dquot pointers we don't want to dqput() */
+        for (cnt = 0; cnt < MAXQUOTAS; cnt++)
+                transfer_from[cnt] = NODQUOT;
+        ret = NO_QUOTA;
+        goto warn_put_all;
 }
 /* Wrapper for transferring ownership of an inode */
 int vfs_dq_transfer(struct inode *inode, struct iattr *iattr)
 {
-        if (sb_any_quota_enabled(inode->i_sb) && !IS_NOQUOTA(inode)) {
+        if (sb_any_quota_active(inode->i_sb) && !IS_NOQUOTA(inode)) {
                vfs_dq_init(inode);
                if (inode->i_sb->dq_op->transfer(inode, iattr) == NO_QUOTA)
                        return 1;
@@ -1533,54 +1631,27 @@ struct dquot_operations dquot_operations = {
        .acquire_dquot  = dquot_acquire,
        .release_dquot  = dquot_release,
        .mark_dirty     = dquot_mark_dquot_dirty,
-        .write_info     = dquot_commit_info
+        .write_info     = dquot_commit_info,
+        .alloc_dquot    = dquot_alloc,
+        .destroy_dquot  = dquot_destroy,
 };
-static inline void set_enable_flags(struct quota_info *dqopt, int type)
-{
-        switch (type) {
-                case USRQUOTA:
-                        dqopt->flags |= DQUOT_USR_ENABLED;
-                        dqopt->flags &= ~DQUOT_USR_SUSPENDED;
-                        break;
-                case GRPQUOTA:
-                        dqopt->flags |= DQUOT_GRP_ENABLED;
-                        dqopt->flags &= ~DQUOT_GRP_SUSPENDED;
-                        break;
-        }
-}
-static inline void reset_enable_flags(struct quota_info *dqopt, int type,
-                                      int remount)
-{
-        switch (type) {
-                case USRQUOTA:
-                        dqopt->flags &= ~DQUOT_USR_ENABLED;
-                        if (remount)
-                                dqopt->flags |= DQUOT_USR_SUSPENDED;
-                        else
-                                dqopt->flags &= ~DQUOT_USR_SUSPENDED;
-                        break;
-                case GRPQUOTA:
-                        dqopt->flags &= ~DQUOT_GRP_ENABLED;
-                        if (remount)
-                                dqopt->flags |= DQUOT_GRP_SUSPENDED;
-                        else
-                                dqopt->flags &= ~DQUOT_GRP_SUSPENDED;
-                        break;
-        }
-}
 /*
 * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount)
 */
-int vfs_quota_off(struct super_block *sb, int type, int remount)
+int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags)
 {
        int cnt, ret = 0;
        struct quota_info *dqopt = sb_dqopt(sb);
        struct inode *toputinode[MAXQUOTAS];
+        /* Cannot turn off usage accounting without turning off limits, or
+         * suspend quotas and simultaneously turn quotas off. */
+        if ((flags & DQUOT_USAGE_ENABLED && !(flags & DQUOT_LIMITS_ENABLED))
+            || (flags & DQUOT_SUSPENDED && flags & (DQUOT_LIMITS_ENABLED |
+            DQUOT_USAGE_ENABLED)))
+                return -EINVAL;
        /* We need to serialize quota_off() for device */
        mutex_lock(&dqopt->dqonoff_mutex);
@@ -1589,7 +1660,7 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
         * sometimes we are called when fill_super() failed and calling
         * sync_fs() in such cases does no good.
         */
-        if (!sb_any_quota_enabled(sb) && !sb_any_quota_suspended(sb)) {
+        if (!sb_any_quota_loaded(sb)) {
                mutex_unlock(&dqopt->dqonoff_mutex);
                return 0;
        }
@@ -1597,17 +1668,33 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
                toputinode[cnt] = NULL;
                if (type != -1 && cnt != type)
                        continue;
-                /* If we keep inodes of quota files after remount and quotaoff
+                if (!sb_has_quota_loaded(sb, cnt))
-                 * is called, drop kept inodes. */
-                if (!remount && sb_has_quota_suspended(sb, cnt)) {
-                        iput(dqopt->files[cnt]);
-                        dqopt->files[cnt] = NULL;
-                        reset_enable_flags(dqopt, cnt, 0);
                        continue;
+                if (flags & DQUOT_SUSPENDED) {
+                        spin_lock(&dq_state_lock);
+                        dqopt->flags |=
+                                dquot_state_flag(DQUOT_SUSPENDED, cnt);
+                        spin_unlock(&dq_state_lock);
+                } else {
+                        spin_lock(&dq_state_lock);
+                        dqopt->flags &= ~dquot_state_flag(flags, cnt);
+                        /* Turning off suspended quotas? */
+                        if (!sb_has_quota_loaded(sb, cnt) &&
+                            sb_has_quota_suspended(sb, cnt)) {
+                                dqopt->flags &= ~dquot_state_flag(
+                                                        DQUOT_SUSPENDED, cnt);
+                                spin_unlock(&dq_state_lock);
+                                iput(dqopt->files[cnt]);
+                                dqopt->files[cnt] = NULL;
+                                continue;
+                        }
+                        spin_unlock(&dq_state_lock);
                }
-                if (!sb_has_quota_enabled(sb, cnt))
+                /* We still have to keep quota loaded? */
+                if (sb_has_quota_loaded(sb, cnt) && !(flags & DQUOT_SUSPENDED))
                        continue;
-                reset_enable_flags(dqopt, cnt, remount);
                /* Note: these are blocking operations */
                drop_dquot_ref(sb, cnt);
@@ -1623,7 +1710,7 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
                put_quota_format(dqopt->info[cnt].dqi_format);
                toputinode[cnt] = dqopt->files[cnt];
-                if (!remount)
+                if (!sb_has_quota_loaded(sb, cnt))
                        dqopt->files[cnt] = NULL;
                dqopt->info[cnt].dqi_flags = 0;
                dqopt->info[cnt].dqi_igrace = 0;
@@ -1631,6 +1718,11 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
                dqopt->ops[cnt] = NULL;
        }
        mutex_unlock(&dqopt->dqonoff_mutex);
+        /* Skip syncing and setting flags if quota files are hidden */
+        if (dqopt->flags & DQUOT_QUOTA_SYS_FILE)
+                goto put_inodes;
        /* Sync the superblock so that buffers with quota data are written to
         * disk (and so userspace sees correct data afterwards). */
        if (sb->s_op->sync_fs)
@@ -1646,7 +1738,7 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
                        mutex_lock(&dqopt->dqonoff_mutex);
                        /* If quota was reenabled in the meantime, we have
                         * nothing to do */
-                        if (!sb_has_quota_enabled(sb, cnt)) {
+                        if (!sb_has_quota_loaded(sb, cnt)) {
                                mutex_lock_nested(&toputinode[cnt]->i_mutex, I_MUTEX_QUOTA);
                                toputinode[cnt]->i_flags &= ~(S_IMMUTABLE |
                                  S_NOATIME | S_NOQUOTA);
@@ -1655,26 +1747,43 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
                                mark_inode_dirty(toputinode[cnt]);
                        }
                        mutex_unlock(&dqopt->dqonoff_mutex);
+                }
+        if (sb->s_bdev)
+                invalidate_bdev(sb->s_bdev);
+put_inodes:
+        for (cnt = 0; cnt < MAXQUOTAS; cnt++)
+                if (toputinode[cnt]) {
                        /* On remount RO, we keep the inode pointer so that we
-                         * can reenable quota on the subsequent remount RW.
+                         * can reenable quota on the subsequent remount RW. We
-                         * But we have better not keep inode pointer when there
+                         * have to check 'flags' variable and not use sb_has_
-                         * is pending delete on the quota file... */
+                         * function because another quotaon / quotaoff could
-                        if (!remount)
+                         * change global state before we got here. We refuse
+                         * to suspend quotas when there is pending delete on
+                         * the quota file... */
+                        if (!(flags & DQUOT_SUSPENDED))
                                iput(toputinode[cnt]);
                        else if (!toputinode[cnt]->i_nlink)
                                ret = -EBUSY;
                }
-        if (sb->s_bdev)
-                invalidate_bdev(sb->s_bdev);
        return ret;
 }
+int vfs_quota_off(struct super_block *sb, int type, int remount)
+{       /* remount: pass DQUOT_SUSPENDED; else pass both usage and limits flags */
+        return vfs_quota_disable(sb, type, remount ? DQUOT_SUSPENDED :
+                                 (DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED));
+}
 /*
 *      Turn quotas on on a device
 */
-/* Helper function when we already have the inode */
+/*
-static int vfs_quota_on_inode(struct inode *inode, int type, int format_id)
+ * Helper function to turn quotas on when we already have the inode of
+ * quota file and no quota information is loaded.
+ */
+static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
+        unsigned int flags)
 {
        struct quota_format_type *fmt = find_quota_format(format_id);
        struct super_block *sb = inode->i_sb;
@@ -1696,27 +1805,37 @@ static int vfs_quota_on_inode(struct inode *inode, int type, int format_id)
                error = -EINVAL;
                goto out_fmt;
        }
+        /* Usage always has to be set... */
+        if (!(flags & DQUOT_USAGE_ENABLED)) {
+                error = -EINVAL;
+                goto out_fmt;
+        }
-        /* As we bypass the pagecache we must now flush the inode so that
+        if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) {
-         * we see all the changes from userspace... */
+                /* As we bypass the pagecache we must now flush the inode so
-        write_inode_now(inode, 1);
+                 * that we see all the changes from userspace... */
-        /* And now flush the block cache so that kernel sees the changes */
+                write_inode_now(inode, 1);
-        invalidate_bdev(sb->s_bdev);
+                /* And now flush the block cache so that kernel sees the
+                 * changes */
+                invalidate_bdev(sb->s_bdev);
+        }
        mutex_lock(&inode->i_mutex);
        mutex_lock(&dqopt->dqonoff_mutex);
-        if (sb_has_quota_enabled(sb, type) ||
+        if (sb_has_quota_loaded(sb, type)) {
-                        sb_has_quota_suspended(sb, type)) {
                error = -EBUSY;
                goto out_lock;
        }
-        /* We don't want quota and atime on quota files (deadlocks possible)
-         * Also nobody should write to the file - we use special IO operations
+        if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) {
-         * which ignore the immutable bit. */
+                /* We don't want quota and atime on quota files (deadlocks
-        down_write(&dqopt->dqptr_sem);
+                 * possible). Also nobody should write to the file - we use
-        oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | S_NOQUOTA);
+                 * special IO operations which ignore the immutable bit. */
-        inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE;
+                down_write(&dqopt->dqptr_sem);
-        up_write(&dqopt->dqptr_sem);
+                oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | S_NOQUOTA);
-        sb->dq_op->drop(inode);
+                inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE;
+                up_write(&dqopt->dqptr_sem);
+                sb->dq_op->drop(inode);
+        }
        error = -EIO;
        dqopt->files[type] = igrab(inode);
@@ -1737,7 +1856,9 @@ static int vfs_quota_on_inode(struct inode *inode, int type, int format_id)
        }
        mutex_unlock(&dqopt->dqio_mutex);
        mutex_unlock(&inode->i_mutex);
-        set_enable_flags(dqopt, type);
+        spin_lock(&dq_state_lock);
+        dqopt->flags |= dquot_state_flag(flags, type);
+        spin_unlock(&dq_state_lock);
        add_dquot_ref(sb, type);
        mutex_unlock(&dqopt->dqonoff_mutex);
@@ -1770,20 +1891,25 @@ static int vfs_quota_on_remount(struct super_block *sb, int type)
        struct quota_info *dqopt = sb_dqopt(sb);
        struct inode *inode;
        int ret;
+        unsigned int flags;
        mutex_lock(&dqopt->dqonoff_mutex);
        if (!sb_has_quota_suspended(sb, type)) {
                mutex_unlock(&dqopt->dqonoff_mutex);
                return 0;
        }
-        BUG_ON(sb_has_quota_enabled(sb, type));
        inode = dqopt->files[type];
        dqopt->files[type] = NULL;
-        reset_enable_flags(dqopt, type, 0);
+        spin_lock(&dq_state_lock);
+        flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED |
+                                                DQUOT_LIMITS_ENABLED, type);
+        dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, type);
+        spin_unlock(&dq_state_lock);
        mutex_unlock(&dqopt->dqonoff_mutex);
-        ret = vfs_quota_on_inode(inode, type, dqopt->info[type].dqi_fmt_id);
+        flags = dquot_generic_flag(flags, type);
+        ret = vfs_load_quota_inode(inode, type, dqopt->info[type].dqi_fmt_id,
+                                   flags);
        iput(inode);
        return ret;
@@ -1799,12 +1925,12 @@ int vfs_quota_on_path(struct super_block *sb, int type, int format_id,
        if (path->mnt->mnt_sb != sb)
                error = -EXDEV;
        else
-                error = vfs_quota_on_inode(path->dentry->d_inode, type,
+                error = vfs_load_quota_inode(path->dentry->d_inode, type,
-                                           format_id);
+                                             format_id, DQUOT_USAGE_ENABLED |
+                                             DQUOT_LIMITS_ENABLED);
        return error;
 }
-/* Actual function called from quotactl() */
 int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name,
                 int remount)
 {
@@ -1823,6 +1949,52 @@ int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name,
 }
 /*
+ * More powerful function for turning on quotas, allowing individual
+ * quota flags to be set.
+ *
+ * @inode:     inode of the quota file; its superblock is the one affected
+ * @type:      quota type the flags apply to
+ * @format_id: quota format, used only when the quota has to be loaded
+ * @flags:     DQUOT_* state flags to turn on
+ *
+ * With DQUOT_SUSPENDED set this only calls vfs_quota_on_remount(); with
+ * no flags at all it does nothing and returns 0. If quota of this type
+ * is already loaded, the requested flags are just added to the state,
+ * and -EBUSY is returned when a requested usage or limits flag is
+ * already set. Otherwise the call is passed on to
+ * vfs_load_quota_inode().
+ */
+int vfs_quota_enable(struct inode *inode, int type, int format_id,
+                unsigned int flags)
+{
+        int ret = 0;
+        struct super_block *sb = inode->i_sb;
+        struct quota_info *dqopt = sb_dqopt(sb);
+        /* Just unsuspend quotas? vfs_quota_on_remount() handles that. */
+        if (flags & DQUOT_SUSPENDED)
+                return vfs_quota_on_remount(sb, type);
+        if (!flags)
+                return 0;
+        /* Quota already loaded (unlocked check)? Then only update flags. */
+        if (sb_has_quota_loaded(sb, type)) {
+                mutex_lock(&dqopt->dqonoff_mutex);
+                /* Recheck under dqonoff_mutex; the test above ran without it */
+                if (!sb_has_quota_loaded(sb, type)) {
+                        mutex_unlock(&dqopt->dqonoff_mutex);
+                        goto load_quota;
+                }
+                if (flags & DQUOT_USAGE_ENABLED &&
+                    sb_has_quota_usage_enabled(sb, type)) {
+                        ret = -EBUSY;
+                        goto out_lock;
+                }
+                if (flags & DQUOT_LIMITS_ENABLED &&
+                    sb_has_quota_limits_enabled(sb, type)) {
+                        ret = -EBUSY;
+                        goto out_lock;
+                }
+                spin_lock(&dq_state_lock);
+                sb_dqopt(sb)->flags |= dquot_state_flag(flags, type);
+                spin_unlock(&dq_state_lock);
+out_lock:
+                mutex_unlock(&dqopt->dqonoff_mutex);
+                return ret;
+        }
+load_quota:
+        return vfs_load_quota_inode(inode, type, format_id, flags);
+}
+/*
 * This function is used when filesystem needs to initialize quotas
 * during mount time.
 */
@@ -1843,7 +2015,8 @@ int vfs_quota_on_mount(struct super_block *sb, char *qf_name,
        error = security_quota_on(dentry);
        if (!error)
-                error = vfs_quota_on_inode(dentry->d_inode, type, format_id);
+                error = vfs_load_quota_inode(dentry->d_inode, type, format_id,
+                                DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
 out:
        dput(dentry);
@@ -1866,14 +2039,24 @@ int vfs_dq_quota_on_remount(struct super_block *sb)
        return ret;
 }
+static inline qsize_t qbtos(qsize_t blocks)     /* quota blocks -> bytes */
+{
+        return blocks << QIF_DQBLKSIZE_BITS;
+}
+static inline qsize_t stoqb(qsize_t space)      /* bytes -> quota blocks, rounded up */
+{
+        return (space + QIF_DQBLKSIZE - 1) >> QIF_DQBLKSIZE_BITS;
+}
 /* Generic routine for getting common part of quota structure */
 static void do_get_dqblk(struct dquot *dquot, struct if_dqblk *di)
 {
        struct mem_dqblk *dm = &dquot->dq_dqb;
        spin_lock(&dq_data_lock);
-        di->dqb_bhardlimit = dm->dqb_bhardlimit;
+        di->dqb_bhardlimit = stoqb(dm->dqb_bhardlimit);
-        di->dqb_bsoftlimit = dm->dqb_bsoftlimit;
+        di->dqb_bsoftlimit = stoqb(dm->dqb_bsoftlimit);
        di->dqb_curspace = dm->dqb_curspace;
        di->dqb_ihardlimit = dm->dqb_ihardlimit;
        di->dqb_isoftlimit = dm->dqb_isoftlimit;
@@ -1888,14 +2071,12 @@ int vfs_get_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *d
 {
        struct dquot *dquot;
-        mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
+        dquot = dqget(sb, id, type);
-        if (!(dquot = dqget(sb, id, type))) {
+        if (dquot == NODQUOT)
-                mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
                return -ESRCH;
-        }
        do_get_dqblk(dquot, di);
        dqput(dquot);
-        mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
        return 0;
 }
@@ -1918,28 +2099,38 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
        if (di->dqb_valid & QIF_SPACE) {
                dm->dqb_curspace = di->dqb_curspace;
                check_blim = 1;
+                __set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
        }
        if (di->dqb_valid & QIF_BLIMITS) {
-                dm->dqb_bsoftlimit = di->dqb_bsoftlimit;
+                dm->dqb_bsoftlimit = qbtos(di->dqb_bsoftlimit);
-                dm->dqb_bhardlimit = di->dqb_bhardlimit;
+                dm->dqb_bhardlimit = qbtos(di->dqb_bhardlimit);
                check_blim = 1;
+                __set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
        }
        if (di->dqb_valid & QIF_INODES) {
                dm->dqb_curinodes = di->dqb_curinodes;
                check_ilim = 1;
+                __set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
        }
        if (di->dqb_valid & QIF_ILIMITS) {
                dm->dqb_isoftlimit = di->dqb_isoftlimit;
                dm->dqb_ihardlimit = di->dqb_ihardlimit;
                check_ilim = 1;
+                __set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
        }
-        if (di->dqb_valid & QIF_BTIME)
+        if (di->dqb_valid & QIF_BTIME) {
                dm->dqb_btime = di->dqb_btime;
-        if (di->dqb_valid & QIF_ITIME)
+                check_blim = 1;
+                __set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
+        }
+        if (di->dqb_valid & QIF_ITIME) {
                dm->dqb_itime = di->dqb_itime;
+                check_ilim = 1;
+                __set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
+        }
        if (check_blim) {
-                if (!dm->dqb_bsoftlimit || toqb(dm->dqb_curspace) < dm->dqb_bsoftlimit) {
+                if (!dm->dqb_bsoftlimit || dm->dqb_curspace < dm->dqb_bsoftlimit) {
                        dm->dqb_btime = 0;
                        clear_bit(DQ_BLKS_B, &dquot->dq_flags);
                }
@@ -1969,14 +2160,14 @@ int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *d
        struct dquot *dquot;
        int rc;
-        mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
+        dquot = dqget(sb, id, type);
-        if (!(dquot = dqget(sb, id, type))) {
+        if (!dquot) {
-                mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
+                rc = -ESRCH;
-                return -ESRCH;
+                goto out;
        }
        rc = do_set_dqblk(dquot, di);
        dqput(dquot);
-        mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
+out:
        return rc;
 }
@@ -1986,7 +2177,7 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
        struct mem_dqinfo *mi;
  
        mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
-        if (!sb_has_quota_enabled(sb, type)) {
+        if (!sb_has_quota_active(sb, type)) {
                mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
                return -ESRCH;
        }
@@ -2005,11 +2196,12 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
 int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
 {
        struct mem_dqinfo *mi;
+        int err = 0;
        mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
-        if (!sb_has_quota_enabled(sb, type)) {
+        if (!sb_has_quota_active(sb, type)) {
-                mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
+                err = -ESRCH;
-                return -ESRCH;
+                goto out;
        }
        mi = sb_dqopt(sb)->info + type;
        spin_lock(&dq_data_lock);
@@ -2023,8 +2215,9 @@ int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
        mark_info_dirty(sb, type);
        /* Force write to disk */
        sb->dq_op->write_info(sb, type);
+out:
        mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
-        return 0;
+        return err;
 }
 struct quotactl_ops vfs_quotactl_ops = {
@@ -2186,10 +2379,13 @@ EXPORT_SYMBOL(register_quota_format);
 EXPORT_SYMBOL(unregister_quota_format);
 EXPORT_SYMBOL(dqstats);
 EXPORT_SYMBOL(dq_data_lock);
+EXPORT_SYMBOL(vfs_quota_enable);
 EXPORT_SYMBOL(vfs_quota_on);
 EXPORT_SYMBOL(vfs_quota_on_path);
 EXPORT_SYMBOL(vfs_quota_on_mount);
+EXPORT_SYMBOL(vfs_quota_disable);
 EXPORT_SYMBOL(vfs_quota_off);
+EXPORT_SYMBOL(dquot_scan_active);
 EXPORT_SYMBOL(vfs_quota_sync);
 EXPORT_SYMBOL(vfs_get_dqinfo);
 EXPORT_SYMBOL(vfs_set_dqinfo);
@@ -2203,6 +2399,8 @@ EXPORT_SYMBOL(dquot_mark_dquot_dirty);
 EXPORT_SYMBOL(dquot_initialize);
 EXPORT_SYMBOL(dquot_drop);
 EXPORT_SYMBOL(vfs_dq_drop);
+EXPORT_SYMBOL(dqget);
+EXPORT_SYMBOL(dqput);
 EXPORT_SYMBOL(dquot_alloc_space);
 EXPORT_SYMBOL(dquot_alloc_inode);
 EXPORT_SYMBOL(dquot_free_space);
diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig
new file mode 100644
index 000000000000..0c754e64232b
--- /dev/null
+++ b/fs/ecryptfs/Kconfig
@@ -0,0 +1,11 @@
+config ECRYPT_FS
+        tristate "eCrypt filesystem layer support (EXPERIMENTAL)"
+        depends on EXPERIMENTAL && KEYS && CRYPTO && NET
+        help
+          Encrypted filesystem that operates on the VFS layer.  See
+          <file:Documentation/filesystems/ecryptfs.txt> to learn more about
+          eCryptfs.  Userspace components are required and can be
+          obtained from <http://ecryptfs.sf.net>.
+          To compile this file system support as a module, choose M here: the
+          module will be called ecryptfs.
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 6046239465a1..c01e043670e2 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -175,8 +175,8 @@ out:
 *
 * Returns zero on success; non-zero on error.
 */
-static int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat,
+int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat,
-                              loff_t offset)
+                       loff_t offset)
 {
        int rc = 0;
        char dst[MD5_DIGEST_SIZE];
@@ -924,6 +924,15 @@ static void ecryptfs_copy_mount_wide_flags_to_inode_flags(
                crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR;
        if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED)
                crypt_stat->flags |= ECRYPTFS_VIEW_AS_ENCRYPTED;
+        if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) {
+                crypt_stat->flags |= ECRYPTFS_ENCRYPT_FILENAMES;
+                if (mount_crypt_stat->flags
+                    & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK)
+                        crypt_stat->flags |= ECRYPTFS_ENCFN_USE_MOUNT_FNEK;
+                else if (mount_crypt_stat->flags
+                         & ECRYPTFS_GLOBAL_ENCFN_USE_FEK)
+                        crypt_stat->flags |= ECRYPTFS_ENCFN_USE_FEK;
+        }
 }
 static int ecryptfs_copy_mount_wide_sigs_to_inode_sigs(
@@ -1060,7 +1069,8 @@ struct ecryptfs_flag_map_elem {
 static struct ecryptfs_flag_map_elem ecryptfs_flag_map[] = {
        {0x00000001, ECRYPTFS_ENABLE_HMAC},
        {0x00000002, ECRYPTFS_ENCRYPTED},
-        {0x00000004, ECRYPTFS_METADATA_IN_XATTR}
+        {0x00000004, ECRYPTFS_METADATA_IN_XATTR},
+        {0x00000008, ECRYPTFS_ENCRYPT_FILENAMES}
 };
 /**
@@ -1149,19 +1159,20 @@ ecryptfs_cipher_code_str_map[] = {
 /**
 * ecryptfs_code_for_cipher_string
- * @crypt_stat: The cryptographic context
+ * @cipher_name: The string alias for the cipher
+ * @key_bytes: Length of key in bytes; used for AES code selection
 *
 * Returns zero on no match, or the cipher code on match
 */
-u8 ecryptfs_code_for_cipher_string(struct ecryptfs_crypt_stat *crypt_stat)
+u8 ecryptfs_code_for_cipher_string(char *cipher_name, size_t key_bytes)
 {
        int i;
        u8 code = 0;
        struct ecryptfs_cipher_code_str_map_elem *map =
                ecryptfs_cipher_code_str_map;
-        if (strcmp(crypt_stat->cipher, "aes") == 0) {
+        if (strcmp(cipher_name, "aes") == 0) {
-                switch (crypt_stat->key_size) {
+                switch (key_bytes) {
                case 16:
                        code = RFC2440_CIPHER_AES_128;
                        break;
@@ -1173,7 +1184,7 @@ u8 ecryptfs_code_for_cipher_string(struct ecryptfs_crypt_stat *crypt_stat)
                }
        } else {
                for (i = 0; i < ARRAY_SIZE(ecryptfs_cipher_code_str_map); i++)
-                        if (strcmp(crypt_stat->cipher, map[i].cipher_str) == 0){
+                        if (strcmp(cipher_name, map[i].cipher_str) == 0) {
                                code = map[i].cipher_code;
                                break;
                        }
@@ -1212,6 +1223,8 @@ int ecryptfs_read_and_validate_header_region(char *data,
                &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat);
        int rc;
+        if (crypt_stat->extent_size == 0)
+                crypt_stat->extent_size = ECRYPTFS_DEFAULT_EXTENT_SIZE;
        rc = ecryptfs_read_lower(data, 0, crypt_stat->extent_size,
                                 ecryptfs_inode);
        if (rc) {
@@ -1221,7 +1234,6 @@ int ecryptfs_read_and_validate_header_region(char *data,
        }
        if (!contains_ecryptfs_marker(data + ECRYPTFS_FILE_SIZE_BYTES)) {
                rc = -EINVAL;
-                ecryptfs_printk(KERN_DEBUG, "Valid marker not found\n");
        }
 out:
        return rc;
@@ -1628,95 +1640,95 @@ out:
 }
 /**
- * ecryptfs_encode_filename - converts a plaintext file name to cipher text
+ * ecryptfs_encrypt_filename - encrypt filename
- * @crypt_stat: The crypt_stat struct associated with the file anem to encode
- * @name: The plaintext name
- * @length: The length of the plaintext
- * @encoded_name: The encypted name
 *
- * Encrypts and encodes a filename into something that constitutes a
+ * CBC-encrypts the filename. We do not want to encrypt the same
- * valid filename for a filesystem, with printable characters.
+ * filename with the same key and IV, which may happen with hard
+ * links, so we prepend random bits to each filename.
 *
- * We assume that we have a properly initialized crypto context,
+ * Returns zero on success; non-zero otherwise
- * pointed to by crypt_stat->tfm.
- *
- * TODO: Implement filename decoding and decryption here, in place of
- * memcpy. We are keeping the framework around for now to (1)
- * facilitate testing of the components needed to implement filename
- * encryption and (2) to provide a code base from which other
- * developers in the community can easily implement this feature.
- *
- * Returns the length of encoded filename; negative if error
 */
-int
+static int
-ecryptfs_encode_filename(struct ecryptfs_crypt_stat *crypt_stat,
+ecryptfs_encrypt_filename(struct ecryptfs_filename *filename,
-                         const char *name, int length, char **encoded_name)
+                          struct ecryptfs_crypt_stat *crypt_stat,
+                          struct ecryptfs_mount_crypt_stat *mount_crypt_stat)
 {
-        int error = 0;
+        int rc = 0;
-        (*encoded_name) = kmalloc(length + 2, GFP_KERNEL);
+        filename->encrypted_filename = NULL;
-        if (!(*encoded_name)) {
+        filename->encrypted_filename_size = 0;
-                error = -ENOMEM;
+        if ((crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCFN_USE_MOUNT_FNEK))
+            || (mount_crypt_stat && (mount_crypt_stat->flags
+                                     & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) {
+                size_t packet_size;
+                size_t remaining_bytes;
+                rc = ecryptfs_write_tag_70_packet(
+                        NULL, NULL,
+                        &filename->encrypted_filename_size,
+                        mount_crypt_stat, NULL,
+                        filename->filename_size);
+                if (rc) {
+                        printk(KERN_ERR "%s: Error attempting to get packet "
+                               "size for tag 72; rc = [%d]\n", __func__,
+                               rc);
+                        filename->encrypted_filename_size = 0;
+                        goto out;
+                }
+                filename->encrypted_filename =
+                        kmalloc(filename->encrypted_filename_size, GFP_KERNEL);
+                if (!filename->encrypted_filename) {
+                        printk(KERN_ERR "%s: Out of memory whilst attempting "
+                               "to kmalloc [%zd] bytes\n", __func__,
+                               filename->encrypted_filename_size);
+                        rc = -ENOMEM;
+                        goto out;
+                }
+                remaining_bytes = filename->encrypted_filename_size;
+                rc = ecryptfs_write_tag_70_packet(filename->encrypted_filename,
+                                                  &remaining_bytes,
+                                                  &packet_size,
+                                                  mount_crypt_stat,
+                                                  filename->filename,
+                                                  filename->filename_size);
+                if (rc) {
+                        printk(KERN_ERR "%s: Error attempting to generate "
+                               "tag 70 packet; rc = [%d]\n", __func__,
+                               rc);
+                        kfree(filename->encrypted_filename);
+                        filename->encrypted_filename = NULL;
+                        filename->encrypted_filename_size = 0;
+                        goto out;
+                }
+                filename->encrypted_filename_size = packet_size;
+        } else {
+                printk(KERN_ERR "%s: No support for requested filename "
+                       "encryption method in this release\n", __func__);
+                rc = -ENOTSUPP;
                goto out;
        }
-        /* TODO: Filename encryption is a scheduled feature for a
-         * future version of eCryptfs. This function is here only for
-         * the purpose of providing a framework for other developers
-         * to easily implement filename encryption. Hint: Replace this
-         * memcpy() with a call to encrypt and encode the
-         * filename, the set the length accordingly. */
-        memcpy((void *)(*encoded_name), (void *)name, length);
-        (*encoded_name)[length] = '\0';
-        error = length + 1;
 out:
-        return error;
+        return rc;
 }
-/**
+static int ecryptfs_copy_filename(char **copied_name, size_t *copied_name_size,
- * ecryptfs_decode_filename - converts the cipher text name to plaintext
+                                  const char *name, size_t name_size)
- * @crypt_stat: The crypt_stat struct associated with the file
- * @name: The filename in cipher text
- * @length: The length of the cipher text name
- * @decrypted_name: The plaintext name
- *
- * Decodes and decrypts the filename.
- *
- * We assume that we have a properly initialized crypto context,
- * pointed to by crypt_stat->tfm.
- *
- * TODO: Implement filename decoding and decryption here, in place of
- * memcpy. We are keeping the framework around for now to (1)
- * facilitate testing of the components needed to implement filename
- * encryption and (2) to provide a code base from which other
- * developers in the community can easily implement this feature.
- *
- * Returns the length of decoded filename; negative if error
- */
-int
-ecryptfs_decode_filename(struct ecryptfs_crypt_stat *crypt_stat,
-                         const char *name, int length, char **decrypted_name)
 {
-        int error = 0;
+        int rc = 0;
-        (*decrypted_name) = kmalloc(length + 2, GFP_KERNEL);
+        (*copied_name) = kmalloc((name_size + 2), GFP_KERNEL);
-        if (!(*decrypted_name)) {
+        if (!(*copied_name)) {
-                error = -ENOMEM;
+                rc = -ENOMEM;
                goto out;
        }
-        /* TODO: Filename encryption is a scheduled feature for a
+        memcpy((void *)(*copied_name), (void *)name, name_size);
-         * future version of eCryptfs. This function is here only for
+        (*copied_name)[(name_size)] = '\0';     /* Only for convenience
-         * the purpose of providing a framework for other developers
-         * to easily implement filename encryption. Hint: Replace this
-         * memcpy() with a call to decode and decrypt the
-         * filename, the set the length accordingly. */
-        memcpy((void *)(*decrypted_name), (void *)name, length);
-        (*decrypted_name)[length + 1] = '\0';   /* Only for convenience
                                                 * in printing out the
                                                 * string in debug
                                                 * messages */
-        error = length;
+        (*copied_name_size) = (name_size + 1);
 out:
-        return error;
+        return rc;
 }
 /**
@@ -1740,7 +1752,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm,
        *key_tfm = NULL;
        if (*key_size > ECRYPTFS_MAX_KEY_BYTES) {
                rc = -EINVAL;
-                printk(KERN_ERR "Requested key size is [%Zd] bytes; maximum "
+                printk(KERN_ERR "Requested key size is [%zd] bytes; maximum "
                      "allowable is [%d]\n", *key_size, ECRYPTFS_MAX_KEY_BYTES);
                goto out;
        }
@@ -1765,7 +1777,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm,
        get_random_bytes(dummy_key, *key_size);
        rc = crypto_blkcipher_setkey(*key_tfm, dummy_key, *key_size);
        if (rc) {
-                printk(KERN_ERR "Error attempting to set key of size [%Zd] for "
+                printk(KERN_ERR "Error attempting to set key of size [%zd] for "
                       "cipher [%s]; rc = [%d]\n", *key_size, cipher_name, rc);
                rc = -EINVAL;
                goto out;
@@ -1910,3 +1922,341 @@ out:
        mutex_unlock(&key_tfm_list_mutex);
        return rc;
 }
+/* 64 characters forming a 6-bit target field */
+static unsigned char *portable_filename_chars = ("-.0123456789ABCD"
+                                                 "EFGHIJKLMNOPQRST"
+                                                 "UVWXYZabcdefghij"
+                                                 "klmnopqrstuvwxyz");
+/* We could either offset on every reverse map or just pad some 0x00's
+ * at the front here. NOTE(review): entries end at index 122 ('z'). */
+static const unsigned char filename_rev_map[] = {
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 7 */
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 15 */
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 23 */
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 31 */
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 39 */
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, /* 47 */
+        0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, /* 55 */
+        0x0A, 0x0B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 63 */
+        0x00, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, /* 71 */
+        0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, /* 79 */
+        0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, /* 87 */
+        0x23, 0x24, 0x25, 0x00, 0x00, 0x00, 0x00, 0x00, /* 95 */
+        0x00, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, /* 103 */
+        0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, /* 111 */
+        0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, /* 119 */
+        0x3D, 0x3E, 0x3F
+};
+/**
+ * ecryptfs_encode_for_filename
+ * @dst: Destination location for encoded filename
+ * @dst_size: Size of the encoded filename in bytes
+ * @src: Source location for the filename to encode
+ * @src_size: Size of the source in bytes
+ */
+void ecryptfs_encode_for_filename(unsigned char *dst, size_t *dst_size,
+                                  unsigned char *src, size_t src_size)
+{
+        size_t num_blocks;
+        size_t block_num = 0;
+        size_t dst_offset = 0;
+        unsigned char last_block[3];
+        if (src_size == 0) {
+                (*dst_size) = 0;
+                goto out;
+        }
+        num_blocks = (src_size / 3);
+        if ((src_size % 3) == 0) {
+                memcpy(last_block, (&src[src_size - 3]), 3);
+        } else {
+                num_blocks++;
+                last_block[2] = 0x00;
+                switch (src_size % 3) {
+                case 1:
+                        last_block[0] = src[src_size - 1];
+                        last_block[1] = 0x00;
+                        break;
+                case 2:
+                        last_block[0] = src[src_size - 2];
+                        last_block[1] = src[src_size - 1];
+                }
+        }
+        (*dst_size) = (num_blocks * 4);
+        if (!dst)
+                goto out;
+        while (block_num < num_blocks) {
+                unsigned char *src_block;
+                unsigned char dst_block[4];
+                if (block_num == (num_blocks - 1))
+                        src_block = last_block;
+                else
+                        src_block = &src[block_num * 3];
+                dst_block[0] = ((src_block[0] >> 2) & 0x3F);
+                dst_block[1] = (((src_block[0] << 4) & 0x30)
+                                | ((src_block[1] >> 4) & 0x0F));
+                dst_block[2] = (((src_block[1] << 2) & 0x3C)
+                                | ((src_block[2] >> 6) & 0x03));
+                dst_block[3] = (src_block[2] & 0x3F);
+                dst[dst_offset++] = portable_filename_chars[dst_block[0]];
+                dst[dst_offset++] = portable_filename_chars[dst_block[1]];
+                dst[dst_offset++] = portable_filename_chars[dst_block[2]];
+                dst[dst_offset++] = portable_filename_chars[dst_block[3]];
+                block_num++;
+        }
+out:
+        return;
+}
+/**
+ * ecryptfs_decode_from_filename
+ * @dst: If NULL, this function only sets @dst_size and returns. If
+ *       non-NULL, this function decodes the encoded octets in @src
+ *       into the memory that @dst points to.
+ * @dst_size: Set to the size of the decoded string.
+ * @src: The encoded set of octets to decode.
+ * @src_size: The size of the encoded set of octets to decode.
+ */
+static void
+ecryptfs_decode_from_filename(unsigned char *dst, size_t *dst_size,
+                              const unsigned char *src, size_t src_size)
+{
+        u8 current_bit_offset = 0;
+        size_t src_byte_offset = 0;
+        size_t dst_byte_offset = 0;
+        if (dst == NULL) {
+                /* Size estimate, not exact: every block of 4 encoded
+                 * characters decodes into a block of 3 decoded
+                 * characters. The caller uses this as the allocation
+                 * size for @dst in a subsequent call.
+                 * NOTE(review): when src_size is a multiple of 4 the
+                 * decode loop also zeroes dst[*dst_size]; confirm. */
+                (*dst_size) = (((src_size + 1) * 3) / 4);
+                goto out;
+        }
+        while (src_byte_offset < src_size) {
+                unsigned char src_byte =
+                                filename_rev_map[(int)src[src_byte_offset]];
+                switch (current_bit_offset) {
+                case 0:
+                        dst[dst_byte_offset] = (src_byte << 2);
+                        current_bit_offset = 6;
+                        break;
+                case 6:
+                        dst[dst_byte_offset++] |= (src_byte >> 4);
+                        dst[dst_byte_offset] = ((src_byte & 0xF)
+                                                 << 4);
+                        current_bit_offset = 4;
+                        break;
+                case 4:
+                        dst[dst_byte_offset++] |= (src_byte >> 2);
+                        dst[dst_byte_offset] = (src_byte << 6);
+                        current_bit_offset = 2;
+                        break;
+                case 2:
+                        dst[dst_byte_offset++] |= (src_byte);
+                        dst[dst_byte_offset] = 0;
+                        current_bit_offset = 0;
+                        break;
+                }
+                src_byte_offset++;
+        }
+        (*dst_size) = dst_byte_offset;
+out:
+        return;
+}
+/**
+ * ecryptfs_encrypt_and_encode_filename - converts a plaintext file name to cipher text
+ * @crypt_stat: The crypt_stat struct associated with the file name to encode
+ * @name: The plaintext name
+ * @name_size: The length of the plaintext name
+ * @encoded_name: The encrypted name
+ *
+ * Encrypts and encodes a filename into something that constitutes a
+ * valid filename for a filesystem, with printable characters.
+ *
+ * We assume that we have a properly initialized crypto context,
+ * pointed to by crypt_stat->tfm.
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+int ecryptfs_encrypt_and_encode_filename(
+        char **encoded_name,
+        size_t *encoded_name_size,
+        struct ecryptfs_crypt_stat *crypt_stat,
+        struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
+        const char *name, size_t name_size)
+{
+        size_t encoded_name_no_prefix_size;
+        int rc = 0;
+        (*encoded_name) = NULL;
+        (*encoded_name_size) = 0;
+        if ((crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCRYPT_FILENAMES))
+            || (mount_crypt_stat && (mount_crypt_stat->flags
+                                     & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES))) {
+                struct ecryptfs_filename *filename;
+                filename = kzalloc(sizeof(*filename), GFP_KERNEL);
+                if (!filename) {
+                        printk(KERN_ERR "%s: Out of memory whilst attempting "
+                               "to kzalloc [%zd] bytes\n", __func__,
+                               sizeof(*filename));
+                        rc = -ENOMEM;
+                        goto out;
+                }
+                filename->filename = (char *)name;
+                filename->filename_size = name_size;
+                rc = ecryptfs_encrypt_filename(filename, crypt_stat,
+                                               mount_crypt_stat);
+                if (rc) {
+                        printk(KERN_ERR "%s: Error attempting to encrypt "
+                               "filename; rc = [%d]\n", __func__, rc);
+                        kfree(filename);
+                        goto out;
+                }
+                ecryptfs_encode_for_filename(
+                        NULL, &encoded_name_no_prefix_size,
+                        filename->encrypted_filename,
+                        filename->encrypted_filename_size);
+                if ((crypt_stat && (crypt_stat->flags
+                                    & ECRYPTFS_ENCFN_USE_MOUNT_FNEK))
+                    || (mount_crypt_stat
+                        && (mount_crypt_stat->flags
+                            & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK)))
+                        (*encoded_name_size) =
+                                (ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE
+                                 + encoded_name_no_prefix_size);
+                else
+                        (*encoded_name_size) =
+                                (ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX_SIZE
+                                 + encoded_name_no_prefix_size);
+                (*encoded_name) = kmalloc((*encoded_name_size) + 1, GFP_KERNEL);
+                if (!(*encoded_name)) {
+                        printk(KERN_ERR "%s: Out of memory whilst attempting "
+                               "to kzalloc [%zd] bytes\n", __func__,
+                               (*encoded_name_size));
+                        rc = -ENOMEM;
+                        kfree(filename->encrypted_filename);
+                        kfree(filename);
+                        goto out;
+                }
+                if ((crypt_stat && (crypt_stat->flags
+                                    & ECRYPTFS_ENCFN_USE_MOUNT_FNEK))
+                    || (mount_crypt_stat
+                        && (mount_crypt_stat->flags
+                            & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) {
+                        memcpy((*encoded_name),
+                               ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX,
+                               ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE);
+                        ecryptfs_encode_for_filename(
+                            ((*encoded_name)
+                             + ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE),
+                            &encoded_name_no_prefix_size,
+                            filename->encrypted_filename,
+                            filename->encrypted_filename_size);
+                        (*encoded_name_size) =
+                                (ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE
+                                 + encoded_name_no_prefix_size);
+                        (*encoded_name)[(*encoded_name_size)] = '\0';
+                        (*encoded_name_size)++;
+                } else {
+                        rc = -ENOTSUPP;
+                }
+                if (rc) {
+                        printk(KERN_ERR "%s: Error attempting to encode "
+                               "encrypted filename; rc = [%d]\n", __func__,
+                               rc);
+                        kfree((*encoded_name));
+                        (*encoded_name) = NULL;
+                        (*encoded_name_size) = 0;
+                }
+                kfree(filename->encrypted_filename);
+                kfree(filename);
+        } else {
+                rc = ecryptfs_copy_filename(encoded_name,
+                                            encoded_name_size,
+                                            name, name_size);
+        }
+out:
+        return rc;
+}
+/**
+ * ecryptfs_decode_and_decrypt_filename - converts the encoded cipher text name to decoded plaintext
+ * @plaintext_name: The plaintext name
+ * @plaintext_name_size: The plaintext name size
+ * @ecryptfs_dir_dentry: eCryptfs directory dentry
+ * @name: The filename in cipher text
+ * @name_size: The cipher text name size
+ *
+ * Decrypts and decodes the filename.
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+int ecryptfs_decode_and_decrypt_filename(char **plaintext_name,
+                                         size_t *plaintext_name_size,
+                                         struct dentry *ecryptfs_dir_dentry,
+                                         const char *name, size_t name_size)
+{
+        char *decoded_name;
+        size_t decoded_name_size;
+        size_t packet_size;
+        int rc = 0;
+        if ((name_size > ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE)
+            && (strncmp(name, ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX,
+                        ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE) == 0)) {
+                struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
+                        &ecryptfs_superblock_to_private(
+                                ecryptfs_dir_dentry->d_sb)->mount_crypt_stat;
+                const char *orig_name = name;
+                size_t orig_name_size = name_size;
+                name += ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE;
+                name_size -= ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE;
+                ecryptfs_decode_from_filename(NULL, &decoded_name_size,
+                                              name, name_size);
+                decoded_name = kmalloc(decoded_name_size, GFP_KERNEL);
+                if (!decoded_name) {
+                        printk(KERN_ERR "%s: Out of memory whilst attempting "
+                               "to kmalloc [%zd] bytes\n", __func__,
+                               decoded_name_size);
+                        rc = -ENOMEM;
+                        goto out;
+                }
+                ecryptfs_decode_from_filename(decoded_name, &decoded_name_size,
+                                              name, name_size);
+                rc = ecryptfs_parse_tag_70_packet(plaintext_name,
+                                                  plaintext_name_size,
+                                                  &packet_size,
+                                                  mount_crypt_stat,
+                                                  decoded_name,
+                                                  decoded_name_size);
+                if (rc) {
+                        printk(KERN_INFO "%s: Could not parse tag 70 packet "
+                               "from filename; copying through filename "
+                               "as-is\n", __func__);
+                        rc = ecryptfs_copy_filename(plaintext_name,
+                                                    plaintext_name_size,
+                                                    orig_name, orig_name_size);
+                        goto out_free;
+                }
+        } else {
+                rc = ecryptfs_copy_filename(plaintext_name,
+                                            plaintext_name_size,
+                                            name, name_size);
+                goto out;
+        }
+out_free:
+        kfree(decoded_name);
+out:
+        return rc;
+}
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index a75026d35d16..c11fc95714ab 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -51,12 +51,16 @@
 #define ECRYPTFS_VERSIONING_XATTR                 0x00000010
 #define ECRYPTFS_VERSIONING_MULTKEY               0x00000020
 #define ECRYPTFS_VERSIONING_DEVMISC               0x00000040
+#define ECRYPTFS_VERSIONING_HMAC                  0x00000080
+#define ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION   0x00000100
+#define ECRYPTFS_VERSIONING_GCM                   0x00000200
 #define ECRYPTFS_VERSIONING_MASK (ECRYPTFS_VERSIONING_PASSPHRASE \
                                  | ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH \
                                  | ECRYPTFS_VERSIONING_PUBKEY \
                                  | ECRYPTFS_VERSIONING_XATTR \
                                  | ECRYPTFS_VERSIONING_MULTKEY \
-                                  | ECRYPTFS_VERSIONING_DEVMISC)
+                                  | ECRYPTFS_VERSIONING_DEVMISC \
+                                  | ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION)
 #define ECRYPTFS_MAX_PASSWORD_LENGTH 64
 #define ECRYPTFS_MAX_PASSPHRASE_BYTES ECRYPTFS_MAX_PASSWORD_LENGTH
 #define ECRYPTFS_SALT_SIZE 8
@@ -199,6 +203,7 @@ ecryptfs_get_key_payload_data(struct key *key)
 #define ECRYPTFS_DEFAULT_CIPHER "aes"
 #define ECRYPTFS_DEFAULT_KEY_BYTES 16
 #define ECRYPTFS_DEFAULT_HASH "md5"
+#define ECRYPTFS_TAG_70_DIGEST ECRYPTFS_DEFAULT_HASH
 #define ECRYPTFS_TAG_1_PACKET_TYPE 0x01
 #define ECRYPTFS_TAG_3_PACKET_TYPE 0x8C
 #define ECRYPTFS_TAG_11_PACKET_TYPE 0xED
@@ -206,30 +211,64 @@ ecryptfs_get_key_payload_data(struct key *key)
 #define ECRYPTFS_TAG_65_PACKET_TYPE 0x41
 #define ECRYPTFS_TAG_66_PACKET_TYPE 0x42
 #define ECRYPTFS_TAG_67_PACKET_TYPE 0x43
+#define ECRYPTFS_TAG_70_PACKET_TYPE 0x46 /* FNEK-encrypted filename
+                                          * as dentry name */
+#define ECRYPTFS_TAG_71_PACKET_TYPE 0x47 /* FNEK-encrypted filename in
+                                          * metadata */
+#define ECRYPTFS_TAG_72_PACKET_TYPE 0x48 /* FEK-encrypted filename as
+                                          * dentry name */
+#define ECRYPTFS_TAG_73_PACKET_TYPE 0x49 /* FEK-encrypted filename as
+                                          * metadata */
+/* Constraint: ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES >=
+ * ECRYPTFS_MAX_IV_BYTES */
+#define ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES 16
+#define ECRYPTFS_NON_NULL 0x42 /* A reasonable substitute for NULL */
 #define MD5_DIGEST_SIZE 16
+#define ECRYPTFS_TAG_70_DIGEST_SIZE MD5_DIGEST_SIZE
+#define ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX "ECRYPTFS_FEK_ENCRYPTED."
+#define ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX_SIZE 23
+#define ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX "ECRYPTFS_FNEK_ENCRYPTED."
+#define ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE 24
+#define ECRYPTFS_ENCRYPTED_DENTRY_NAME_LEN (18 + 1 + 4 + 1 + 32)
 struct ecryptfs_key_sig {
        struct list_head crypt_stat_list;
        char keysig[ECRYPTFS_SIG_SIZE_HEX];
 };
+struct ecryptfs_filename { /* NOTE(review): no user of this struct is visible in this hunk; field notes below are read from the declarations only */
+        struct list_head crypt_stat_list; /* list linkage; which list head owns it is not shown here - TODO confirm */
+#define ECRYPTFS_FILENAME_CONTAINS_DECRYPTED 0x00000001 /* the only bit defined alongside the flags field below */
+        u32 flags; /* bit field; presumably takes ECRYPTFS_FILENAME_* values - verify against users */
+        u32 seq_no; /* meaning not shown in this hunk - TODO confirm */
+        char *filename; /* pointer paired with filename_size; ownership not shown here */
+        char *encrypted_filename; /* pointer paired with encrypted_filename_size; ownership not shown here */
+        size_t filename_size; /* whether this counts a trailing NUL is not shown - TODO confirm */
+        size_t encrypted_filename_size; /* whether this counts a trailing NUL is not shown - TODO confirm */
+        char fnek_sig[ECRYPTFS_SIG_SIZE_HEX]; /* fixed-size signature buffer; note there is no "+ 1" slot here */
+        char dentry_name[ECRYPTFS_ENCRYPTED_DENTRY_NAME_LEN + 1]; /* "+ 1" presumably leaves room for a terminating NUL */
+};
 /**
 * This is the primary struct associated with each encrypted file.
 *
 * TODO: cache align/pack?
 */
 struct ecryptfs_crypt_stat {
-#define ECRYPTFS_STRUCT_INITIALIZED 0x00000001
+#define ECRYPTFS_STRUCT_INITIALIZED   0x00000001
-#define ECRYPTFS_POLICY_APPLIED     0x00000002
+#define ECRYPTFS_POLICY_APPLIED       0x00000002
-#define ECRYPTFS_NEW_FILE           0x00000004
+#define ECRYPTFS_NEW_FILE             0x00000004
-#define ECRYPTFS_ENCRYPTED          0x00000008
+#define ECRYPTFS_ENCRYPTED            0x00000008
-#define ECRYPTFS_SECURITY_WARNING   0x00000010
+#define ECRYPTFS_SECURITY_WARNING     0x00000010
-#define ECRYPTFS_ENABLE_HMAC        0x00000020
+#define ECRYPTFS_ENABLE_HMAC          0x00000020
-#define ECRYPTFS_ENCRYPT_IV_PAGES   0x00000040
+#define ECRYPTFS_ENCRYPT_IV_PAGES     0x00000040
-#define ECRYPTFS_KEY_VALID          0x00000080
+#define ECRYPTFS_KEY_VALID            0x00000080
-#define ECRYPTFS_METADATA_IN_XATTR  0x00000100
+#define ECRYPTFS_METADATA_IN_XATTR    0x00000100
-#define ECRYPTFS_VIEW_AS_ENCRYPTED  0x00000200
+#define ECRYPTFS_VIEW_AS_ENCRYPTED    0x00000200
-#define ECRYPTFS_KEY_SET            0x00000400
+#define ECRYPTFS_KEY_SET              0x00000400
+#define ECRYPTFS_ENCRYPT_FILENAMES    0x00000800
+#define ECRYPTFS_ENCFN_USE_MOUNT_FNEK 0x00001000
+#define ECRYPTFS_ENCFN_USE_FEK        0x00002000
        u32 flags;
        unsigned int file_version;
        size_t iv_bytes;
@@ -332,13 +371,20 @@ struct ecryptfs_mount_crypt_stat {
 #define ECRYPTFS_XATTR_METADATA_ENABLED        0x00000002
 #define ECRYPTFS_ENCRYPTED_VIEW_ENABLED        0x00000004
 #define ECRYPTFS_MOUNT_CRYPT_STAT_INITIALIZED  0x00000008
+#define ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES      0x00000010
+#define ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK   0x00000020
+#define ECRYPTFS_GLOBAL_ENCFN_USE_FEK          0x00000040
        u32 flags;
        struct list_head global_auth_tok_list;
        struct mutex global_auth_tok_list_mutex;
        size_t num_global_auth_toks;
        size_t global_default_cipher_key_size;
+        size_t global_default_fn_cipher_key_bytes;
        unsigned char global_default_cipher_name[ECRYPTFS_MAX_CIPHER_NAME_SIZE
                                                 + 1];
+        unsigned char global_default_fn_cipher_name[
+                ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1];
+        char global_default_fnek_sig[ECRYPTFS_SIG_SIZE_HEX + 1];
 };
 /* superblock private data. */
@@ -571,13 +617,22 @@ struct ecryptfs_open_req {
 int ecryptfs_interpose(struct dentry *hidden_dentry,
                       struct dentry *this_dentry, struct super_block *sb,
                       u32 flags);
+int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
+                                        struct dentry *lower_dentry,
+                                        struct ecryptfs_crypt_stat *crypt_stat,
+                                        struct inode *ecryptfs_dir_inode,
+                                        struct nameidata *ecryptfs_nd);
+int ecryptfs_decode_and_decrypt_filename(char **decrypted_name,
+                                         size_t *decrypted_name_size,
+                                         struct dentry *ecryptfs_dentry,
+                                         const char *name, size_t name_size);
 int ecryptfs_fill_zeros(struct file *file, loff_t new_length);
-int ecryptfs_decode_filename(struct ecryptfs_crypt_stat *crypt_stat,
+int ecryptfs_encrypt_and_encode_filename(
-                             const char *name, int length,
+        char **encoded_name,
-                             char **decrypted_name);
+        size_t *encoded_name_size,
-int ecryptfs_encode_filename(struct ecryptfs_crypt_stat *crypt_stat,
+        struct ecryptfs_crypt_stat *crypt_stat,
-                             const char *name, int length,
+        struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
-                             char **encoded_name);
+        const char *name, size_t name_size);
 struct dentry *ecryptfs_lower_dentry(struct dentry *this_dentry);
 void ecryptfs_dump_hex(char *data, int bytes);
 int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg,
@@ -599,7 +654,7 @@ int ecryptfs_read_and_validate_header_region(char *data,
                                             struct inode *ecryptfs_inode);
 int ecryptfs_read_and_validate_xattr_region(char *page_virt,
                                            struct dentry *ecryptfs_dentry);
-u8 ecryptfs_code_for_cipher_string(struct ecryptfs_crypt_stat *crypt_stat);
+u8 ecryptfs_code_for_cipher_string(char *cipher_name, size_t key_bytes);
 int ecryptfs_cipher_code_to_string(char *str, u8 cipher_code);
 void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat);
 int ecryptfs_generate_key_packet_set(char *dest_base,
@@ -694,5 +749,17 @@ int ecryptfs_privileged_open(struct file **lower_file,
                             struct vfsmount *lower_mnt,
                             const struct cred *cred);
 int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry);
+int
+ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
+                             size_t *packet_size,
+                             struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
+                             char *filename, size_t filename_size);
+int
+ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
+                             size_t *packet_size,
+                             struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
+                             char *data, size_t max_packet_size);
+int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat,
+                       loff_t offset);
 #endif /* #ifndef ECRYPTFS_KERNEL_H */
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index eb3dc4c7ac06..9e944057001b 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -77,27 +77,27 @@ struct ecryptfs_getdents_callback {
 /* Inspired by generic filldir in fs/readdir.c */
 static int
-ecryptfs_filldir(void *dirent, const char *name, int namelen, loff_t offset,
+ecryptfs_filldir(void *dirent, const char *lower_name, int lower_namelen,
-                 u64 ino, unsigned int d_type)
+                 loff_t offset, u64 ino, unsigned int d_type)
 {
-        struct ecryptfs_crypt_stat *crypt_stat;
        struct ecryptfs_getdents_callback *buf =
            (struct ecryptfs_getdents_callback *)dirent;
+        size_t name_size;
+        char *name;
        int rc;
-        int decoded_length;
-        char *decoded_name;
-        crypt_stat = ecryptfs_dentry_to_private(buf->dentry)->crypt_stat;
        buf->filldir_called++;
-        decoded_length = ecryptfs_decode_filename(crypt_stat, name, namelen,
+        rc = ecryptfs_decode_and_decrypt_filename(&name, &name_size,
-                                                  &decoded_name);
+                                                  buf->dentry, lower_name,
-        if (decoded_length < 0) {
+                                                  lower_namelen);
-                rc = decoded_length;
+        if (rc) {
+                printk(KERN_ERR "%s: Error attempting to decode and decrypt "
+                       "filename [%s]; rc = [%d]\n", __func__, lower_name,
+                       rc);
                goto out;
        }
-        rc = buf->filldir(buf->dirent, decoded_name, decoded_length, offset,
+        rc = buf->filldir(buf->dirent, name, name_size, offset, ino, d_type);
-                          ino, d_type);
+        kfree(name);
-        kfree(decoded_name);
        if (rc >= 0)
                buf->entries_written++;
 out:
@@ -106,8 +106,8 @@ out:
 /**
 * ecryptfs_readdir
- * @file: The ecryptfs file struct
+ * @file: The eCryptfs directory file
- * @dirent: Directory entry
+ * @dirent: Directory entry handle
 * @filldir: The filldir callback function
 */
 static int ecryptfs_readdir(struct file *file, void *dirent, filldir_t filldir)
@@ -275,18 +275,9 @@ static int ecryptfs_release(struct inode *inode, struct file *file)
 static int
 ecryptfs_fsync(struct file *file, struct dentry *dentry, int datasync)
 {
-        struct file *lower_file = ecryptfs_file_to_lower(file);
+        return vfs_fsync(ecryptfs_file_to_lower(file),
-        struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
+                         ecryptfs_dentry_to_lower(dentry),
-        struct inode *lower_inode = lower_dentry->d_inode;
+                         datasync); /* New body: the sync request is forwarded
+                                     * to vfs_fsync() on the lower file and
+                                     * lower dentry, and its return value is
+                                     * passed back unchanged. The removed lines
+                                     * called the lower ->fsync() directly under
+                                     * the lower inode's i_mutex and returned
+                                     * -EINVAL when no ->fsync() existed.
+                                     * NOTE(review): whether vfs_fsync() keeps
+                                     * that locking and error behaviour is not
+                                     * visible here - confirm. */
-        int rc = -EINVAL;
-        if (lower_inode->i_fop->fsync) {
-                mutex_lock(&lower_inode->i_mutex);
-                rc = lower_inode->i_fop->fsync(lower_file, lower_dentry,
-                                               datasync);
-                mutex_unlock(&lower_inode->i_mutex);
-        }
-        return rc;
 }
 static int ecryptfs_fasync(int fd, struct file *file, int flag)
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 89209f00f9c7..5697899a168d 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -52,8 +52,7 @@ static void unlock_dir(struct dentry *dir)
 /**
 * ecryptfs_create_underlying_file
 * @lower_dir_inode: inode of the parent in the lower fs of the new file
- * @lower_dentry: New file's dentry in the lower fs
+ * @dentry: New file's dentry
- * @ecryptfs_dentry: New file's dentry in ecryptfs
 * @mode: The mode of the new file
 * @nd: nameidata of ecryptfs' parent's dentry & vfsmount
 *
@@ -228,8 +227,7 @@ ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry,
 {
        int rc;
-        /* ecryptfs_do_create() calls ecryptfs_interpose(), which opens
+        /* ecryptfs_do_create() calls ecryptfs_interpose() */
-         * the crypt_stat->lower_file (persistent file) */
        rc = ecryptfs_do_create(directory_inode, ecryptfs_dentry, mode, nd);
        if (unlikely(rc)) {
                ecryptfs_printk(KERN_WARNING, "Failed to create file in"
@@ -244,141 +242,91 @@ out:
 }
 /**
- * ecryptfs_lookup
+ * ecryptfs_lookup_and_interpose_lower - Perform a lookup
- * @dir: inode
- * @dentry: The dentry
- * @nd: nameidata, may be NULL
- *
- * Find a file on disk. If the file does not exist, then we'll add it to the
- * dentry cache and continue on to read it from the disk.
 */
-static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry,
+int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
-                                      struct nameidata *nd)
+                                        struct dentry *lower_dentry,
+                                        struct ecryptfs_crypt_stat *crypt_stat,
+                                        struct inode *ecryptfs_dir_inode,
+                                        struct nameidata *ecryptfs_nd)
 {
-        int rc = 0;
        struct dentry *lower_dir_dentry;
-        struct dentry *lower_dentry;
        struct vfsmount *lower_mnt;
-        char *encoded_name;
+        struct inode *lower_inode;
-        int encoded_namelen;
-        struct ecryptfs_crypt_stat *crypt_stat = NULL;
        struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
        char *page_virt = NULL;
-        struct inode *lower_inode;
        u64 file_size;
+        int rc = 0;
-        lower_dir_dentry = ecryptfs_dentry_to_lower(dentry->d_parent);
+        lower_dir_dentry = lower_dentry->d_parent;
-        dentry->d_op = &ecryptfs_dops;
+        lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(
-        if ((dentry->d_name.len == 1 && !strcmp(dentry->d_name.name, "."))
+                                   ecryptfs_dentry->d_parent));
-            || (dentry->d_name.len == 2
-                && !strcmp(dentry->d_name.name, ".."))) {
-                d_drop(dentry);
-                goto out;
-        }
-        encoded_namelen = ecryptfs_encode_filename(crypt_stat,
-                                                   dentry->d_name.name,
-                                                   dentry->d_name.len,
-                                                   &encoded_name);
-        if (encoded_namelen < 0) {
-                rc = encoded_namelen;
-                d_drop(dentry);
-                goto out;
-        }
-        ecryptfs_printk(KERN_DEBUG, "encoded_name = [%s]; encoded_namelen "
-                        "= [%d]\n", encoded_name, encoded_namelen);
-        lower_dentry = lookup_one_len(encoded_name, lower_dir_dentry,
-                                      encoded_namelen - 1);
-        kfree(encoded_name);
-        if (IS_ERR(lower_dentry)) {
-                ecryptfs_printk(KERN_ERR, "ERR from lower_dentry\n");
-                rc = PTR_ERR(lower_dentry);
-                d_drop(dentry);
-                goto out;
-        }
-        lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent));
-        ecryptfs_printk(KERN_DEBUG, "lower_dentry = [%p]; lower_dentry->"
-                "d_name.name = [%s]\n", lower_dentry,
-                lower_dentry->d_name.name);
        lower_inode = lower_dentry->d_inode;
-        fsstack_copy_attr_atime(dir, lower_dir_dentry->d_inode);
+        fsstack_copy_attr_atime(ecryptfs_dir_inode, lower_dir_dentry->d_inode);
        BUG_ON(!atomic_read(&lower_dentry->d_count));
-        ecryptfs_set_dentry_private(dentry,
+        ecryptfs_set_dentry_private(ecryptfs_dentry,
                                    kmem_cache_alloc(ecryptfs_dentry_info_cache,
                                                     GFP_KERNEL));
-        if (!ecryptfs_dentry_to_private(dentry)) {
+        if (!ecryptfs_dentry_to_private(ecryptfs_dentry)) {
                rc = -ENOMEM;
-                ecryptfs_printk(KERN_ERR, "Out of memory whilst attempting "
+                printk(KERN_ERR "%s: Out of memory whilst attempting "
-                                "to allocate ecryptfs_dentry_info struct\n");
+                       "to allocate ecryptfs_dentry_info struct\n",
+                        __func__);
                goto out_dput;
        }
-        ecryptfs_set_dentry_lower(dentry, lower_dentry);
+        ecryptfs_set_dentry_lower(ecryptfs_dentry, lower_dentry);
-        ecryptfs_set_dentry_lower_mnt(dentry, lower_mnt);
+        ecryptfs_set_dentry_lower_mnt(ecryptfs_dentry, lower_mnt);
        if (!lower_dentry->d_inode) {
                /* We want to add because we couldn't find in lower */
-                d_add(dentry, NULL);
+                d_add(ecryptfs_dentry, NULL);
                goto out;
        }
-        rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb,
+        rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry,
-                                ECRYPTFS_INTERPOSE_FLAG_D_ADD);
+                                ecryptfs_dir_inode->i_sb, 1);
        if (rc) {
-                ecryptfs_printk(KERN_ERR, "Error interposing\n");
+                printk(KERN_ERR "%s: Error interposing; rc = [%d]\n",
+                       __func__, rc);
                goto out;
        }
-        if (S_ISDIR(lower_inode->i_mode)) {
+        if (S_ISDIR(lower_inode->i_mode))
-                ecryptfs_printk(KERN_DEBUG, "Is a directory; returning\n");
                goto out;
-        }
+        if (S_ISLNK(lower_inode->i_mode))
-        if (S_ISLNK(lower_inode->i_mode)) {
-                ecryptfs_printk(KERN_DEBUG, "Is a symlink; returning\n");
                goto out;
-        }
+        if (special_file(lower_inode->i_mode))
-        if (special_file(lower_inode->i_mode)) {
-                ecryptfs_printk(KERN_DEBUG, "Is a special file; returning\n");
                goto out;
-        }
+        if (!ecryptfs_nd)
-        if (!nd) {
-                ecryptfs_printk(KERN_DEBUG, "We have a NULL nd, just leave"
-                                "as we *think* we are about to unlink\n");
                goto out;
-        }
        /* Released in this function */
-        page_virt = kmem_cache_zalloc(ecryptfs_header_cache_2,
+        page_virt = kmem_cache_zalloc(ecryptfs_header_cache_2, GFP_USER);
-                                      GFP_USER);
        if (!page_virt) {
+                printk(KERN_ERR "%s: Cannot kmem_cache_zalloc() a page\n",
+                       __func__);
                rc = -ENOMEM;
-                ecryptfs_printk(KERN_ERR,
-                                "Cannot ecryptfs_kmalloc a page\n");
                goto out;
        }
-        crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
+        if (!ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->lower_file) {
-        if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED))
+                rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
-                ecryptfs_set_default_sizes(crypt_stat);
-        if (!ecryptfs_inode_to_private(dentry->d_inode)->lower_file) {
-                rc = ecryptfs_init_persistent_file(dentry);
                if (rc) {
                        printk(KERN_ERR "%s: Error attempting to initialize "
                               "the persistent file for the dentry with name "
                               "[%s]; rc = [%d]\n", __func__,
-                               dentry->d_name.name, rc);
+                               ecryptfs_dentry->d_name.name, rc);
-                        goto out;
+                        goto out_free_kmem;
                }
        }
        rc = ecryptfs_read_and_validate_header_region(page_virt,
-                                                      dentry->d_inode);
+                                                      ecryptfs_dentry->d_inode);
        if (rc) {
-                rc = ecryptfs_read_and_validate_xattr_region(page_virt, dentry);
+                rc = ecryptfs_read_and_validate_xattr_region(page_virt,
+                                                             ecryptfs_dentry);
                if (rc) {
-                        printk(KERN_DEBUG "Valid metadata not found in header "
-                               "region or xattr region; treating file as "
-                               "unencrypted\n");
                        rc = 0;
-                        kmem_cache_free(ecryptfs_header_cache_2, page_virt);
+                        goto out_free_kmem;
-                        goto out;
                }
                crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR;
        }
        mount_crypt_stat = &ecryptfs_superblock_to_private(
-                dentry->d_sb)->mount_crypt_stat;
+                ecryptfs_dentry->d_sb)->mount_crypt_stat;
        if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) {
                if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
                        file_size = (crypt_stat->num_header_bytes_at_front
@@ -388,14 +336,103 @@ static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry,
        } else {
                file_size = get_unaligned_be64(page_virt);
        }
-        i_size_write(dentry->d_inode, (loff_t)file_size);
+        i_size_write(ecryptfs_dentry->d_inode, (loff_t)file_size);
+out_free_kmem:
        kmem_cache_free(ecryptfs_header_cache_2, page_virt);
        goto out;
 out_dput:
        dput(lower_dentry);
-        d_drop(dentry);
+        d_drop(ecryptfs_dentry);
 out:
+        return rc;
+}
+/**
+ * ecryptfs_lookup
+ * @ecryptfs_dir_inode: The eCryptfs directory inode
+ * @ecryptfs_dentry: The eCryptfs dentry that we are looking up
+ * @ecryptfs_nd: nameidata; may be NULL
+ *
+ * Find a file on disk. If the file does not exist, then we'll add it to the
+ * dentry cache and continue on to read it from the disk.
+ *
+ * The lower directory is first searched for the name exactly as given. If
+ * that lower dentry already has an inode, or if filename encryption is
+ * enabled neither in the per-inode crypt_stat flags
+ * (ECRYPTFS_ENCRYPT_FILENAMES) nor in the mount-wide flags
+ * (ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES), that dentry is handed straight to
+ * ecryptfs_lookup_and_interpose_lower(). Otherwise the first lower dentry
+ * is released, the name is passed through
+ * ecryptfs_encrypt_and_encode_filename(), and the lower directory is
+ * searched a second time with the resulting name; that result is then
+ * handed to ecryptfs_lookup_and_interpose_lower().
+ *
+ * A lookup of "." or "..", a failed lower lookup, or a failed filename
+ * encryption drops @ecryptfs_dentry with d_drop(). The encoded name buffer
+ * is freed on every exit path.
+ *
+ * NOTE(review): when the plaintext name is found in the lower directory,
+ * crypt_stat is still NULL when it is passed on, while
+ * ecryptfs_lookup_and_interpose_lower() reads and writes crypt_stat->flags
+ * on some paths - confirm those paths cannot be reached with a NULL
+ * crypt_stat.
+ * NOTE(review): ecryptfs_dentry->d_inode is read below before this function
+ * has interposed or d_add()ed the dentry - confirm it can be non-NULL (and
+ * that inode_info is meaningful) at that point.
+ *
+ * Returns ERR_PTR(rc): rc is zero when nothing failed (including the "."
+ * and ".." case), otherwise the error code from the step that failed.
+ */
+static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
+                                      struct dentry *ecryptfs_dentry,
+                                      struct nameidata *ecryptfs_nd)
+{
+        char *encrypted_and_encoded_name = NULL;
+        size_t encrypted_and_encoded_name_size;
+        struct ecryptfs_crypt_stat *crypt_stat = NULL;
+        struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL;
+        struct ecryptfs_inode_info *inode_info;
+        struct dentry *lower_dir_dentry, *lower_dentry;
+        int rc = 0;
+        ecryptfs_dentry->d_op = &ecryptfs_dops;
+        if ((ecryptfs_dentry->d_name.len == 1
+             && !strcmp(ecryptfs_dentry->d_name.name, "."))
+            || (ecryptfs_dentry->d_name.len == 2
+                && !strcmp(ecryptfs_dentry->d_name.name, ".."))) {
+                goto out_d_drop; /* rc is still 0 here */
+        }
+        lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent);
+        lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name,
+                                      lower_dir_dentry,
+                                      ecryptfs_dentry->d_name.len);
+        if (IS_ERR(lower_dentry)) {
+                rc = PTR_ERR(lower_dentry);
+                printk(KERN_ERR "%s: lookup_one_len() returned [%d] on "
+                       "lower_dentry = [%s]\n", __func__, rc,
+                       ecryptfs_dentry->d_name.name);
+                goto out_d_drop;
+        }
+        if (lower_dentry->d_inode) /* name exists as-is in the lower dir */
+                goto lookup_and_interpose;
+        inode_info =  ecryptfs_inode_to_private(ecryptfs_dentry->d_inode);
+        if (inode_info) {
+                crypt_stat = &inode_info->crypt_stat;
+                /* TODO: lock for crypt_stat comparison */
+                if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED))
+                        ecryptfs_set_default_sizes(crypt_stat);
+        }
+        if (crypt_stat)
+                mount_crypt_stat = crypt_stat->mount_crypt_stat;
+        else
+                mount_crypt_stat = &ecryptfs_superblock_to_private(
+                        ecryptfs_dentry->d_sb)->mount_crypt_stat;
+        if (!(crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCRYPT_FILENAMES))
+            && !(mount_crypt_stat && (mount_crypt_stat->flags
+                                     & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)))
+                goto lookup_and_interpose;
+        dput(lower_dentry); /* drop the first lookup's result before retrying */
+        rc = ecryptfs_encrypt_and_encode_filename(
+                &encrypted_and_encoded_name, &encrypted_and_encoded_name_size,
+                crypt_stat, mount_crypt_stat, ecryptfs_dentry->d_name.name,
+                ecryptfs_dentry->d_name.len);
+        if (rc) {
+                printk(KERN_ERR "%s: Error attempting to encrypt and encode "
+                       "filename; rc = [%d]\n", __func__, rc);
+                goto out_d_drop;
+        }
+        lower_dentry = lookup_one_len(encrypted_and_encoded_name,
+                                      lower_dir_dentry,
+                                      encrypted_and_encoded_name_size - 1); /* "- 1": presumably the size counts a trailing NUL - TODO confirm */
+        if (IS_ERR(lower_dentry)) {
+                rc = PTR_ERR(lower_dentry);
+                printk(KERN_ERR "%s: lookup_one_len() returned [%d] on "
+                       "lower_dentry = [%s]\n", __func__, rc,
+                       encrypted_and_encoded_name);
+                goto out_d_drop;
+        }
+lookup_and_interpose:
+        rc = ecryptfs_lookup_and_interpose_lower(ecryptfs_dentry, lower_dentry,
+                                                 crypt_stat, ecryptfs_dir_inode,
+                                                 ecryptfs_nd);
+        goto out;
+out_d_drop:
+        d_drop(ecryptfs_dentry);
+out:
+        kfree(encrypted_and_encoded_name);
         return ERR_PTR(rc);
 }
@@ -466,19 +503,21 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry,
        struct dentry *lower_dentry;
        struct dentry *lower_dir_dentry;
        char *encoded_symname;
-        int encoded_symlen;
+        size_t encoded_symlen;
-        struct ecryptfs_crypt_stat *crypt_stat = NULL;
+        struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL;
        lower_dentry = ecryptfs_dentry_to_lower(dentry);
        dget(lower_dentry);
        lower_dir_dentry = lock_parent(lower_dentry);
-        encoded_symlen = ecryptfs_encode_filename(crypt_stat, symname,
+        mount_crypt_stat = &ecryptfs_superblock_to_private(
-                                                  strlen(symname),
+                dir->i_sb)->mount_crypt_stat;
-                                                  &encoded_symname);
+        rc = ecryptfs_encrypt_and_encode_filename(&encoded_symname,
-        if (encoded_symlen < 0) {
+                                                  &encoded_symlen,
-                rc = encoded_symlen;
+                                                  NULL,
+                                                  mount_crypt_stat, symname,
+                                                  strlen(symname));
+        if (rc)
                goto out_lock;
-        }
        rc = vfs_symlink(lower_dir_dentry->d_inode, lower_dentry,
                         encoded_symname);
        kfree(encoded_symname);
@@ -602,53 +641,54 @@ out_lock:
 }
 static int
-ecryptfs_readlink(struct dentry *dentry, char __user * buf, int bufsiz)
+ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
 {
-        int rc;
-        struct dentry *lower_dentry;
-        char *decoded_name;
        char *lower_buf;
-        mm_segment_t old_fs;
+        struct dentry *lower_dentry;
        struct ecryptfs_crypt_stat *crypt_stat;
+        char *plaintext_name;
+        size_t plaintext_name_size;
+        mm_segment_t old_fs;
+        int rc;
        lower_dentry = ecryptfs_dentry_to_lower(dentry);
-        if (!lower_dentry->d_inode->i_op ||
+        if (!lower_dentry->d_inode->i_op->readlink) {
-            !lower_dentry->d_inode->i_op->readlink) {
                rc = -EINVAL;
                goto out;
        }
+        crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
        /* Released in this function */
        lower_buf = kmalloc(bufsiz, GFP_KERNEL);
        if (lower_buf == NULL) {
-                ecryptfs_printk(KERN_ERR, "Out of memory\n");
+                printk(KERN_ERR "%s: Out of memory whilst attempting to "
+                       "kmalloc [%d] bytes\n", __func__, bufsiz);
                rc = -ENOMEM;
                goto out;
        }
        old_fs = get_fs();
        set_fs(get_ds());
-        ecryptfs_printk(KERN_DEBUG, "Calling readlink w/ "
-                        "lower_dentry->d_name.name = [%s]\n",
-                        lower_dentry->d_name.name);
        rc = lower_dentry->d_inode->i_op->readlink(lower_dentry,
                                                   (char __user *)lower_buf,
                                                   bufsiz);
        set_fs(old_fs);
        if (rc >= 0) {
-                crypt_stat = NULL;
+                rc = ecryptfs_decode_and_decrypt_filename(&plaintext_name,
-                rc = ecryptfs_decode_filename(crypt_stat, lower_buf, rc,
+                                                          &plaintext_name_size,
-                                              &decoded_name);
+                                                          dentry, lower_buf,
-                if (rc == -ENOMEM)
+                                                          rc);
+                if (rc) {
+                        printk(KERN_ERR "%s: Error attempting to decode and "
+                               "decrypt filename; rc = [%d]\n", __func__,
+                                rc);
                        goto out_free_lower_buf;
-                if (rc > 0) {
-                        ecryptfs_printk(KERN_DEBUG, "Copying [%d] bytes "
-                                        "to userspace: [%*s]\n", rc,
-                                        decoded_name);
-                        if (copy_to_user(buf, decoded_name, rc))
-                                rc = -EFAULT;
                }
-                kfree(decoded_name);
+                rc = copy_to_user(buf, plaintext_name, plaintext_name_size);
-                fsstack_copy_attr_atime(dentry->d_inode,
+                if (rc)
-                                        lower_dentry->d_inode);
+                        rc = -EFAULT;
+                else
+                        rc = plaintext_name_size;
+                kfree(plaintext_name);
+                fsstack_copy_attr_atime(dentry->d_inode, lower_dentry->d_inode);
        }
 out_free_lower_buf:
        kfree(lower_buf);
@@ -670,13 +710,12 @@ static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd)
        }
        old_fs = get_fs();
        set_fs(get_ds());
-        ecryptfs_printk(KERN_DEBUG, "Calling readlink w/ "
-                        "dentry->d_name.name = [%s]\n", dentry->d_name.name);
        rc = dentry->d_inode->i_op->readlink(dentry, (char __user *)buf, len);
-        buf[rc] = '\0';
        set_fs(old_fs);
        if (rc < 0)
                goto out_free;
+        else
+                buf[rc] = '\0';
        rc = 0;
        nd_set_link(nd, buf);
        goto out;
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 0d713b691941..ff539420cc6f 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -358,7 +358,7 @@ parse_tag_67_packet(struct ecryptfs_key_record *key_rec,
        /* verify that everything through the encrypted FEK size is present */
        if (message_len < 4) {
                rc = -EIO;
-                printk(KERN_ERR "%s: message_len is [%Zd]; minimum acceptable "
+                printk(KERN_ERR "%s: message_len is [%zd]; minimum acceptable "
                       "message length is [%d]\n", __func__, message_len, 4);
                goto out;
        }
@@ -385,13 +385,13 @@ parse_tag_67_packet(struct ecryptfs_key_record *key_rec,
        i += data_len;
        if (message_len < (i + key_rec->enc_key_size)) {
                rc = -EIO;
-                printk(KERN_ERR "%s: message_len [%Zd]; max len is [%Zd]\n",
+                printk(KERN_ERR "%s: message_len [%zd]; max len is [%zd]\n",
                       __func__, message_len, (i + key_rec->enc_key_size));
                goto out;
        }
        if (key_rec->enc_key_size > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) {
                rc = -EIO;
-                printk(KERN_ERR "%s: Encrypted key_size [%Zd] larger than "
+                printk(KERN_ERR "%s: Encrypted key_size [%zd] larger than "
                       "the maximum key size [%d]\n", __func__,
                       key_rec->enc_key_size,
                       ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES);
@@ -403,6 +403,580 @@ out:
 }
 static int
+ecryptfs_find_global_auth_tok_for_sig(
+        struct ecryptfs_global_auth_tok **global_auth_tok,
+        struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig)
+{
+        struct ecryptfs_global_auth_tok *walker;
+        int rc = 0;
+        (*global_auth_tok) = NULL;
+        mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex);
+        list_for_each_entry(walker,
+                            &mount_crypt_stat->global_auth_tok_list,
+                            mount_crypt_stat_list) {
+                if (memcmp(walker->sig, sig, ECRYPTFS_SIG_SIZE_HEX) == 0) {
+                        (*global_auth_tok) = walker;
+                        goto out;
+                }
+        }
+        rc = -EINVAL;
+out:
+        mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex);
+        return rc;
+}
+/**
+ * ecryptfs_find_auth_tok_for_sig
+ * @auth_tok: Set to the matching auth_tok; NULL if not found
+ * @mount_crypt_stat: The mount-wide cryptographic context whose
+ *                    registered auth_toks are searched first
+ * @sig: Sig of auth_tok to find
+ *
+ * The auth_toks registered on @mount_crypt_stat are consulted first.
+ * When none of them matches @sig, the lookup falls back to
+ * ecryptfs_keyring_auth_tok_for_sig(). This function could
+ * potentially try a lot harder to find auth_tok's (e.g., by calling
+ * out to ecryptfsd to dynamically retrieve an auth_tok object) so
+ * that static registration of auth_tok's will no longer be necessary.
+ *
+ * Returns zero on no error; non-zero on error
+ */
+static int
+ecryptfs_find_auth_tok_for_sig(
+        struct ecryptfs_auth_tok **auth_tok,
+        struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
+        char *sig)
+{
+        struct ecryptfs_global_auth_tok *global_auth_tok;
+        struct key *auth_tok_key;
+        (*auth_tok) = NULL;
+        if (!ecryptfs_find_global_auth_tok_for_sig(&global_auth_tok,
+                                                   mount_crypt_stat, sig)) {
+                (*auth_tok) = global_auth_tok->global_auth_tok;
+                return 0;
+        }
+        /* NOTE(review): auth_tok_key is discarded here; if the keyring
+         * lookup hands back a key reference, nothing in this function
+         * releases it -- confirm against
+         * ecryptfs_keyring_auth_tok_for_sig() */
+        return ecryptfs_keyring_auth_tok_for_sig(&auth_tok_key, auth_tok,
+                                                 sig);
+}
+/**
+ * ecryptfs_write_tag_70_packet() can gobble a lot of stack space. We
+ * stuff most of the function's working variables in a kmalloc'd
+ * struct to help reduce eCryptfs' overall stack usage. One instance
+ * is allocated at the start of each call and freed before the
+ * function returns.
+ */
+struct ecryptfs_write_tag_70_packet_silly_stack {
+        u8 cipher_code; /* code for the filename cipher, written to the packet */
+        size_t max_packet_size; /* tag + 3 length octets + sig + cipher + payload */
+        size_t packet_size_len; /* octets used by the encoded packet length */
+        size_t block_aligned_filename_size; /* num_rand_bytes + filename_size */
+        size_t block_size; /* block size of the filename cipher */
+        size_t i; /* write offset into the destination buffer */
+        size_t j; /* index while filling the prefix bytes */
+        size_t num_rand_bytes; /* prefix length, including the \0 separator */
+        struct mutex *tfm_mutex; /* held while the cipher tfm is in use */
+        char *block_aligned_filename; /* kzalloc'd: prefix, \0, then filename */
+        struct ecryptfs_auth_tok *auth_tok; /* token found for the FNEK sig */
+        struct scatterlist src_sg; /* describes block_aligned_filename */
+        struct scatterlist dst_sg; /* describes the payload area of dest */
+        struct blkcipher_desc desc; /* cipher descriptor (tfm, flags, IV) */
+        char iv[ECRYPTFS_MAX_IV_BYTES]; /* zeroed before encryption */
+        char hash[ECRYPTFS_TAG_70_DIGEST_SIZE]; /* digest feeding the prefix */
+        char tmp_hash[ECRYPTFS_TAG_70_DIGEST_SIZE]; /* receives hash-of-hash */
+        struct hash_desc hash_desc; /* digest descriptor */
+        struct scatterlist hash_sg; /* input to the digest */
+};
+/**
+ * ecryptfs_write_tag_70_packet - Write encrypted filename (EFN) packet against FNEK
+ * @dest: Buffer to receive the packet; when NULL nothing is written
+ *        and only the maximum packet size is reported in @packet_size
+ * @remaining_bytes: Space available at @dest; reduced by the packet
+ *                   size on success. Not accessed when @dest is NULL
+ * @packet_size: Set to the number of octets written to @dest, or to
+ *               the maximum packet size when @dest is NULL; zero on
+ *               failure paths that occur before the packet is complete
+ * @mount_crypt_stat: The mount-wide cryptographic context
+ * @filename: NULL-terminated filename string
+ * @filename_size: Number of bytes of @filename to place in the packet
+ *
+ * This is the simplest mechanism for achieving filename encryption in
+ * eCryptfs. It encrypts the given filename with the mount-wide
+ * filename encryption key (FNEK) and stores it in a packet to @dest,
+ * which the caller will encode and write directly into the dentry
+ * name.
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+int
+ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
+                             size_t *packet_size,
+                             struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
+                             char *filename, size_t filename_size)
+{
+        struct ecryptfs_write_tag_70_packet_silly_stack *s;
+        int rc = 0;
+        /* Cleared first so that every failure path, including the
+         * allocation below, reports an empty packet */
+        (*packet_size) = 0;
+        s = kmalloc(sizeof(*s), GFP_KERNEL);
+        if (!s) {
+                printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
+                       "[%zd] bytes of kernel memory\n", __func__, sizeof(*s));
+                /* Without an error code the caller would see success */
+                rc = -ENOMEM;
+                goto out;
+        }
+        s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+        rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(
+                &s->desc.tfm,
+                &s->tfm_mutex, mount_crypt_stat->global_default_fn_cipher_name);
+        if (unlikely(rc)) {
+                printk(KERN_ERR "Internal error whilst attempting to get "
+                       "tfm and mutex for cipher name [%s]; rc = [%d]\n",
+                       mount_crypt_stat->global_default_fn_cipher_name, rc);
+                goto out;
+        }
+        mutex_lock(s->tfm_mutex);
+        s->block_size = crypto_blkcipher_blocksize(s->desc.tfm);
+        /* Plus one for the \0 separator between the random prefix
+         * and the plaintext filename */
+        s->num_rand_bytes = (ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES + 1);
+        s->block_aligned_filename_size = (s->num_rand_bytes + filename_size);
+        /* Grow the prefix until prefix + filename fills whole cipher
+         * blocks */
+        if ((s->block_aligned_filename_size % s->block_size) != 0) {
+                s->num_rand_bytes += (s->block_size
+                                      - (s->block_aligned_filename_size
+                                         % s->block_size));
+                s->block_aligned_filename_size = (s->num_rand_bytes
+                                                  + filename_size);
+        }
+        /* Octet 0: Tag 70 identifier
+         * Octets 1-N1: Tag 70 packet size (includes cipher identifier
+         *              and block-aligned encrypted filename size)
+         * Octets N1-N2: FNEK sig (ECRYPTFS_SIG_SIZE)
+         * Octet N2-N3: Cipher identifier (1 octet)
+         * Octets N3-N4: Block-aligned encrypted filename
+         *  - Consists of a minimum number of random characters, a \0
+         *    separator, and then the filename */
+        s->max_packet_size = (1                   /* Tag 70 identifier */
+                              + 3                 /* Max Tag 70 packet size */
+                              + ECRYPTFS_SIG_SIZE /* FNEK sig */
+                              + 1                 /* Cipher identifier */
+                              + s->block_aligned_filename_size);
+        /* Size query only: report the upper bound and write nothing */
+        if (dest == NULL) {
+                (*packet_size) = s->max_packet_size;
+                goto out_unlock;
+        }
+        if (s->max_packet_size > (*remaining_bytes)) {
+                printk(KERN_WARNING "%s: Require [%zd] bytes to write; only "
+                       "[%zd] available\n", __func__, s->max_packet_size,
+                       (*remaining_bytes));
+                rc = -EINVAL;
+                goto out_unlock;
+        }
+        /* Zero-filled, so the byte following the prefix already holds
+         * the \0 separator */
+        s->block_aligned_filename = kzalloc(s->block_aligned_filename_size,
+                                            GFP_KERNEL);
+        if (!s->block_aligned_filename) {
+                printk(KERN_ERR "%s: Out of kernel memory whilst attempting to "
+                       "kzalloc [%zd] bytes\n", __func__,
+                       s->block_aligned_filename_size);
+                rc = -ENOMEM;
+                goto out_unlock;
+        }
+        s->i = 0;
+        dest[s->i++] = ECRYPTFS_TAG_70_PACKET_TYPE;
+        rc = ecryptfs_write_packet_length(&dest[s->i],
+                                          (ECRYPTFS_SIG_SIZE
+                                           + 1 /* Cipher code */
+                                           + s->block_aligned_filename_size),
+                                          &s->packet_size_len);
+        if (rc) {
+                printk(KERN_ERR "%s: Error generating tag 70 packet "
+                       "header; cannot generate packet length; rc = [%d]\n",
+                       __func__, rc);
+                goto out_free_unlock;
+        }
+        s->i += s->packet_size_len;
+        ecryptfs_from_hex(&dest[s->i],
+                          mount_crypt_stat->global_default_fnek_sig,
+                          ECRYPTFS_SIG_SIZE);
+        s->i += ECRYPTFS_SIG_SIZE;
+        s->cipher_code = ecryptfs_code_for_cipher_string(
+                mount_crypt_stat->global_default_fn_cipher_name,
+                mount_crypt_stat->global_default_fn_cipher_key_bytes);
+        if (s->cipher_code == 0) {
+                printk(KERN_WARNING "%s: Unable to generate code for "
+                       "cipher [%s] with key bytes [%zd]\n", __func__,
+                       mount_crypt_stat->global_default_fn_cipher_name,
+                       mount_crypt_stat->global_default_fn_cipher_key_bytes);
+                rc = -EINVAL;
+                goto out_free_unlock;
+        }
+        dest[s->i++] = s->cipher_code;
+        rc = ecryptfs_find_auth_tok_for_sig(
+                &s->auth_tok, mount_crypt_stat,
+                mount_crypt_stat->global_default_fnek_sig);
+        if (rc) {
+                printk(KERN_ERR "%s: Error attempting to find auth tok for "
+                       "fnek sig [%s]; rc = [%d]\n", __func__,
+                       mount_crypt_stat->global_default_fnek_sig, rc);
+                goto out_free_unlock;
+        }
+        /* TODO: Support other key modules than passphrase for
+         * filename encryption */
+        BUG_ON(s->auth_tok->token_type != ECRYPTFS_PASSWORD);
+        sg_init_one(
+                &s->hash_sg,
+                (u8 *)s->auth_tok->token.password.session_key_encryption_key,
+                s->auth_tok->token.password.session_key_encryption_key_bytes);
+        s->hash_desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+        s->hash_desc.tfm = crypto_alloc_hash(ECRYPTFS_TAG_70_DIGEST, 0,
+                                             CRYPTO_ALG_ASYNC);
+        if (IS_ERR(s->hash_desc.tfm)) {
+                rc = PTR_ERR(s->hash_desc.tfm);
+                printk(KERN_ERR "%s: Error attempting to "
+                       "allocate hash crypto context; rc = [%d]\n",
+                       __func__, rc);
+                goto out_free_unlock;
+        }
+        rc = crypto_hash_init(&s->hash_desc);
+        if (rc) {
+                printk(KERN_ERR
+                       "%s: Error initializing crypto hash; rc = [%d]\n",
+                       __func__, rc);
+                goto out_release_free_unlock;
+        }
+        rc = crypto_hash_update(
+                &s->hash_desc, &s->hash_sg,
+                s->auth_tok->token.password.session_key_encryption_key_bytes);
+        if (rc) {
+                printk(KERN_ERR
+                       "%s: Error updating crypto hash; rc = [%d]\n",
+                       __func__, rc);
+                goto out_release_free_unlock;
+        }
+        rc = crypto_hash_final(&s->hash_desc, s->hash);
+        if (rc) {
+                printk(KERN_ERR
+                       "%s: Error finalizing crypto hash; rc = [%d]\n",
+                       __func__, rc);
+                goto out_release_free_unlock;
+        }
+        /* Fill the prefix from the digest of the key; every time the
+         * digest is used up it is replaced by the digest of itself.
+         * The last prefix byte is left alone: it is the \0 separator. */
+        for (s->j = 0; s->j < (s->num_rand_bytes - 1); s->j++) {
+                s->block_aligned_filename[s->j] =
+                        s->hash[(s->j % ECRYPTFS_TAG_70_DIGEST_SIZE)];
+                if ((s->j % ECRYPTFS_TAG_70_DIGEST_SIZE)
+                    == (ECRYPTFS_TAG_70_DIGEST_SIZE - 1)) {
+                        sg_init_one(&s->hash_sg, (u8 *)s->hash,
+                                    ECRYPTFS_TAG_70_DIGEST_SIZE);
+                        rc = crypto_hash_init(&s->hash_desc);
+                        if (rc) {
+                                printk(KERN_ERR
+                                       "%s: Error initializing crypto hash; "
+                                       "rc = [%d]\n", __func__, rc);
+                                goto out_release_free_unlock;
+                        }
+                        rc = crypto_hash_update(&s->hash_desc, &s->hash_sg,
+                                                ECRYPTFS_TAG_70_DIGEST_SIZE);
+                        if (rc) {
+                                printk(KERN_ERR
+                                       "%s: Error updating crypto hash; "
+                                       "rc = [%d]\n", __func__, rc);
+                                goto out_release_free_unlock;
+                        }
+                        rc = crypto_hash_final(&s->hash_desc, s->tmp_hash);
+                        if (rc) {
+                                printk(KERN_ERR
+                                       "%s: Error finalizing crypto hash; "
+                                       "rc = [%d]\n", __func__, rc);
+                                goto out_release_free_unlock;
+                        }
+                        memcpy(s->hash, s->tmp_hash,
+                               ECRYPTFS_TAG_70_DIGEST_SIZE);
+                }
+                /* A \0 inside the prefix would be mistaken for the
+                 * separator when the packet is parsed */
+                if (s->block_aligned_filename[s->j] == '\0')
+                        s->block_aligned_filename[s->j] = ECRYPTFS_NON_NULL;
+        }
+        memcpy(&s->block_aligned_filename[s->num_rand_bytes], filename,
+               filename_size);
+        rc = virt_to_scatterlist(s->block_aligned_filename,
+                                 s->block_aligned_filename_size, &s->src_sg, 1);
+        if (rc != 1) {
+                printk(KERN_ERR "%s: Internal error whilst attempting to "
+                       "convert filename memory to scatterlist; "
+                       "expected rc = 1; got rc = [%d]. "
+                       "block_aligned_filename_size = [%zd]\n", __func__, rc,
+                       s->block_aligned_filename_size);
+                /* Anything but exactly one entry is a failure; do not
+                 * let a non-negative rc look like success */
+                if (rc >= 0)
+                        rc = -EINVAL;
+                goto out_release_free_unlock;
+        }
+        rc = virt_to_scatterlist(&dest[s->i], s->block_aligned_filename_size,
+                                 &s->dst_sg, 1);
+        if (rc != 1) {
+                printk(KERN_ERR "%s: Internal error whilst attempting to "
+                       "convert encrypted filename memory to scatterlist; "
+                       "expected rc = 1; got rc = [%d]. "
+                       "block_aligned_filename_size = [%zd]\n", __func__, rc,
+                       s->block_aligned_filename_size);
+                if (rc >= 0)
+                        rc = -EINVAL;
+                goto out_release_free_unlock;
+        }
+        /* The characters in the first block effectively do the job
+         * of the IV here, so we just use 0's for the IV. Note the
+         * constraint that ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES
+         * >= ECRYPTFS_MAX_IV_BYTES. */
+        memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES);
+        s->desc.info = s->iv;
+        rc = crypto_blkcipher_setkey(
+                s->desc.tfm,
+                s->auth_tok->token.password.session_key_encryption_key,
+                mount_crypt_stat->global_default_fn_cipher_key_bytes);
+        if (rc < 0) {
+                printk(KERN_ERR "%s: Error setting key for crypto context; "
+                       "rc = [%d]. s->auth_tok->token.password.session_key_"
+                       "encryption_key = [0x%p]; mount_crypt_stat->"
+                       "global_default_fn_cipher_key_bytes = [%zd]\n", __func__,
+                       rc,
+                       s->auth_tok->token.password.session_key_encryption_key,
+                       mount_crypt_stat->global_default_fn_cipher_key_bytes);
+                goto out_release_free_unlock;
+        }
+        rc = crypto_blkcipher_encrypt_iv(&s->desc, &s->dst_sg, &s->src_sg,
+                                         s->block_aligned_filename_size);
+        if (rc) {
+                printk(KERN_ERR "%s: Error attempting to encrypt filename; "
+                       "rc = [%d]\n", __func__, rc);
+                goto out_release_free_unlock;
+        }
+        s->i += s->block_aligned_filename_size;
+        (*packet_size) = s->i;
+        (*remaining_bytes) -= (*packet_size);
+out_release_free_unlock:
+        crypto_free_hash(s->hash_desc.tfm);
+out_free_unlock:
+        /* Wipe the plaintext filename before giving the memory back */
+        memset(s->block_aligned_filename, 0, s->block_aligned_filename_size);
+        kfree(s->block_aligned_filename);
+out_unlock:
+        mutex_unlock(s->tfm_mutex);
+out:
+        kfree(s);
+        return rc;
+}
+/*
+ * Working variables of ecryptfs_parse_tag_70_packet(), kept in a
+ * kmalloc'd struct rather than on the stack. One instance is
+ * allocated at the start of each call and freed before the function
+ * returns.
+ */
+struct ecryptfs_parse_tag_70_packet_silly_stack {
+        u8 cipher_code; /* cipher identifier octet read from the packet */
+        size_t max_packet_size; /* not referenced by the parser shown here */
+        size_t packet_size_len; /* octets occupied by the encoded length */
+        size_t parsed_tag_70_packet_size; /* length decoded from the packet */
+        size_t block_aligned_filename_size; /* parsed size - sig - cipher octet */
+        size_t block_size; /* not referenced by the parser shown here */
+        size_t i; /* index used to locate the \0 separator */
+        struct mutex *tfm_mutex; /* held while the cipher tfm is in use */
+        char *decrypted_filename; /* kmalloc'd buffer for the decrypted payload */
+        struct ecryptfs_auth_tok *auth_tok; /* token found for the FNEK sig */
+        struct scatterlist src_sg; /* describes the encrypted payload in data */
+        struct scatterlist dst_sg; /* describes decrypted_filename */
+        struct blkcipher_desc desc; /* cipher descriptor (tfm, flags, IV) */
+        char fnek_sig_hex[ECRYPTFS_SIG_SIZE_HEX + 1]; /* hex sig plus NUL */
+        char iv[ECRYPTFS_MAX_IV_BYTES]; /* zeroed before decryption */
+        char cipher_string[ECRYPTFS_MAX_CIPHER_NAME_SIZE]; /* from cipher_code */
+};
+/**
+ * ecryptfs_parse_tag_70_packet - Parse and decrypt an FNEK-encrypted filename packet
+ * @filename: This function kmalloc's the memory for the filename;
+ *            ownership passes to the caller on success, and it is
+ *            set to NULL on failure
+ * @filename_size: This function sets this to the length of the
+ *                 filename, not counting the terminating \0 that is
+ *                 appended to @filename
+ * @packet_size: This function sets this to the number of octets
+ *               in the packet parsed
+ * @mount_crypt_stat: The mount-wide cryptographic context
+ * @data: The memory location containing the start of the tag 70
+ *        packet
+ * @max_packet_size: The maximum legal size of the packet to be parsed
+ *                   from @data
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+int
+ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
+                             size_t *packet_size,
+                             struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
+                             char *data, size_t max_packet_size)
+{
+        struct ecryptfs_parse_tag_70_packet_silly_stack *s;
+        int rc = 0;
+        (*packet_size) = 0;
+        (*filename_size) = 0;
+        (*filename) = NULL;
+        s = kmalloc(sizeof(*s), GFP_KERNEL);
+        if (!s) {
+                printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
+                       "[%zd] bytes of kernel memory\n", __func__, sizeof(*s));
+                /* Without an error code the caller would see success
+                 * together with a NULL filename */
+                rc = -ENOMEM;
+                goto out;
+        }
+        s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+        if (max_packet_size < (1 + 1 + ECRYPTFS_SIG_SIZE + 1 + 1)) {
+                printk(KERN_WARNING "%s: max_packet_size is [%zd]; it must be "
+                       "at least [%d]\n", __func__, max_packet_size,
+                        (1 + 1 + ECRYPTFS_SIG_SIZE + 1 + 1));
+                rc = -EINVAL;
+                goto out;
+        }
+        /* Octet 0: Tag 70 identifier
+         * Octets 1-N1: Tag 70 packet size (includes cipher identifier
+         *              and block-aligned encrypted filename size)
+         * Octets N1-N2: FNEK sig (ECRYPTFS_SIG_SIZE)
+         * Octet N2-N3: Cipher identifier (1 octet)
+         * Octets N3-N4: Block-aligned encrypted filename
+         *  - Consists of a minimum number of random numbers, a \0
+         *    separator, and then the filename */
+        if (data[(*packet_size)++] != ECRYPTFS_TAG_70_PACKET_TYPE) {
+                printk(KERN_WARNING "%s: Invalid packet tag [0x%.2x]; must be "
+                       "tag [0x%.2x]\n", __func__,
+                       data[((*packet_size) - 1)], ECRYPTFS_TAG_70_PACKET_TYPE);
+                rc = -EINVAL;
+                goto out;
+        }
+        rc = ecryptfs_parse_packet_length(&data[(*packet_size)],
+                                          &s->parsed_tag_70_packet_size,
+                                          &s->packet_size_len);
+        if (rc) {
+                printk(KERN_WARNING "%s: Error parsing packet length; "
+                       "rc = [%d]\n", __func__, rc);
+                goto out;
+        }
+        /* The subtraction below is unsigned; a packet too short to
+         * hold the sig and the cipher octet would wrap around */
+        if (s->parsed_tag_70_packet_size < (ECRYPTFS_SIG_SIZE + 1)) {
+                printk(KERN_WARNING "%s: Parsed packet size is [%zd]; it "
+                       "must be at least [%d]\n", __func__,
+                       s->parsed_tag_70_packet_size,
+                       (ECRYPTFS_SIG_SIZE + 1));
+                rc = -EINVAL;
+                goto out;
+        }
+        s->block_aligned_filename_size = (s->parsed_tag_70_packet_size
+                                          - ECRYPTFS_SIG_SIZE - 1);
+        if ((1 + s->packet_size_len + s->parsed_tag_70_packet_size)
+            > max_packet_size) {
+                /* Report the same quantity that was just compared */
+                printk(KERN_WARNING "%s: max_packet_size is [%zd]; real packet "
+                       "size is [%zd]\n", __func__, max_packet_size,
+                       (1 + s->packet_size_len
+                        + s->parsed_tag_70_packet_size));
+                rc = -EINVAL;
+                goto out;
+        }
+        (*packet_size) += s->packet_size_len;
+        ecryptfs_to_hex(s->fnek_sig_hex, &data[(*packet_size)],
+                        ECRYPTFS_SIG_SIZE);
+        s->fnek_sig_hex[ECRYPTFS_SIG_SIZE_HEX] = '\0';
+        (*packet_size) += ECRYPTFS_SIG_SIZE;
+        s->cipher_code = data[(*packet_size)++];
+        rc = ecryptfs_cipher_code_to_string(s->cipher_string, s->cipher_code);
+        if (rc) {
+                printk(KERN_WARNING "%s: Cipher code [%d] is invalid\n",
+                       __func__, s->cipher_code);
+                goto out;
+        }
+        rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&s->desc.tfm,
+                                                        &s->tfm_mutex,
+                                                        s->cipher_string);
+        if (unlikely(rc)) {
+                printk(KERN_ERR "Internal error whilst attempting to get "
+                       "tfm and mutex for cipher name [%s]; rc = [%d]\n",
+                       s->cipher_string, rc);
+                goto out;
+        }
+        mutex_lock(s->tfm_mutex);
+        rc = virt_to_scatterlist(&data[(*packet_size)],
+                                 s->block_aligned_filename_size, &s->src_sg, 1);
+        if (rc != 1) {
+                printk(KERN_ERR "%s: Internal error whilst attempting to "
+                       "convert encrypted filename memory to scatterlist; "
+                       "expected rc = 1; got rc = [%d]. "
+                       "block_aligned_filename_size = [%zd]\n", __func__, rc,
+                       s->block_aligned_filename_size);
+                /* Anything but exactly one entry is a failure; do not
+                 * let a non-negative rc look like success */
+                if (rc >= 0)
+                        rc = -EINVAL;
+                goto out_unlock;
+        }
+        (*packet_size) += s->block_aligned_filename_size;
+        s->decrypted_filename = kmalloc(s->block_aligned_filename_size,
+                                        GFP_KERNEL);
+        if (!s->decrypted_filename) {
+                printk(KERN_ERR "%s: Out of memory whilst attempting to "
+                       "kmalloc [%zd] bytes\n", __func__,
+                       s->block_aligned_filename_size);
+                rc = -ENOMEM;
+                goto out_unlock;
+        }
+        rc = virt_to_scatterlist(s->decrypted_filename,
+                                 s->block_aligned_filename_size, &s->dst_sg, 1);
+        if (rc != 1) {
+                printk(KERN_ERR "%s: Internal error whilst attempting to "
+                       "convert decrypted filename memory to scatterlist; "
+                       "expected rc = 1; got rc = [%d]. "
+                       "block_aligned_filename_size = [%zd]\n", __func__, rc,
+                       s->block_aligned_filename_size);
+                if (rc >= 0)
+                        rc = -EINVAL;
+                goto out_free_unlock;
+        }
+        /* The characters in the first block effectively do the job of
+         * the IV here, so we just use 0's for the IV. Note the
+         * constraint that ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES
+         * >= ECRYPTFS_MAX_IV_BYTES. */
+        memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES);
+        s->desc.info = s->iv;
+        rc = ecryptfs_find_auth_tok_for_sig(&s->auth_tok, mount_crypt_stat,
+                                            s->fnek_sig_hex);
+        if (rc) {
+                printk(KERN_ERR "%s: Error attempting to find auth tok for "
+                       "fnek sig [%s]; rc = [%d]\n", __func__, s->fnek_sig_hex,
+                       rc);
+                goto out_free_unlock;
+        }
+        /* TODO: Support other key modules than passphrase for
+         * filename encryption */
+        BUG_ON(s->auth_tok->token_type != ECRYPTFS_PASSWORD);
+        rc = crypto_blkcipher_setkey(
+                s->desc.tfm,
+                s->auth_tok->token.password.session_key_encryption_key,
+                mount_crypt_stat->global_default_fn_cipher_key_bytes);
+        if (rc < 0) {
+                printk(KERN_ERR "%s: Error setting key for crypto context; "
+                       "rc = [%d]. s->auth_tok->token.password.session_key_"
+                       "encryption_key = [0x%p]; mount_crypt_stat->"
+                       "global_default_fn_cipher_key_bytes = [%zd]\n", __func__,
+                       rc,
+                       s->auth_tok->token.password.session_key_encryption_key,
+                       mount_crypt_stat->global_default_fn_cipher_key_bytes);
+                goto out_free_unlock;
+        }
+        rc = crypto_blkcipher_decrypt_iv(&s->desc, &s->dst_sg, &s->src_sg,
+                                         s->block_aligned_filename_size);
+        if (rc) {
+                printk(KERN_ERR "%s: Error attempting to decrypt filename; "
+                       "rc = [%d]\n", __func__, rc);
+                goto out_free_unlock;
+        }
+        /* Skip the prefix: bound the index before dereferencing so a
+         * payload without a separator cannot be read past its end */
+        s->i = 0;
+        while (s->i < s->block_aligned_filename_size
+               && s->decrypted_filename[s->i] != '\0')
+                s->i++;
+        if (s->i == s->block_aligned_filename_size) {
+                printk(KERN_WARNING "%s: Invalid tag 70 packet; could not "
+                       "find valid separator between random characters and "
+                       "the filename\n", __func__);
+                rc = -EINVAL;
+                goto out_free_unlock;
+        }
+        /* Step over the separator itself */
+        s->i++;
+        (*filename_size) = (s->block_aligned_filename_size - s->i);
+        if (!((*filename_size) > 0 && (*filename_size < PATH_MAX))) {
+                printk(KERN_WARNING "%s: Filename size is [%zd], which is "
+                       "invalid\n", __func__, (*filename_size));
+                rc = -EINVAL;
+                goto out_free_unlock;
+        }
+        (*filename) = kmalloc(((*filename_size) + 1), GFP_KERNEL);
+        if (!(*filename)) {
+                printk(KERN_ERR "%s: Out of memory whilst attempting to "
+                       "kmalloc [%zd] bytes\n", __func__,
+                       ((*filename_size) + 1));
+                rc = -ENOMEM;
+                goto out_free_unlock;
+        }
+        memcpy((*filename), &s->decrypted_filename[s->i], (*filename_size));
+        (*filename)[(*filename_size)] = '\0';
+out_free_unlock:
+        kfree(s->decrypted_filename);
+out_unlock:
+        mutex_unlock(s->tfm_mutex);
+out:
+        /* Never hand partially filled outputs back on failure */
+        if (rc) {
+                (*packet_size) = 0;
+                (*filename_size) = 0;
+                (*filename) = NULL;
+        }
+        kfree(s);
+        return rc;
+}
+static int
 ecryptfs_get_auth_tok_sig(char **sig, struct ecryptfs_auth_tok *auth_tok)
 {
        int rc = 0;
@@ -897,30 +1471,6 @@ out:
        return rc;
 }
-static int
-ecryptfs_find_global_auth_tok_for_sig(
-        struct ecryptfs_global_auth_tok **global_auth_tok,
-        struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig)
-{
-        struct ecryptfs_global_auth_tok *walker;
-        int rc = 0;
-        (*global_auth_tok) = NULL;
-        mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex);
-        list_for_each_entry(walker,
-                            &mount_crypt_stat->global_auth_tok_list,
-                            mount_crypt_stat_list) {
-                if (memcmp(walker->sig, sig, ECRYPTFS_SIG_SIZE_HEX) == 0) {
-                        (*global_auth_tok) = walker;
-                        goto out;
-                }
-        }
-        rc = -EINVAL;
-out:
-        mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex);
-        return rc;
-}
 /**
 * ecryptfs_verify_version
 * @version: The version number to confirm
@@ -990,43 +1540,6 @@ out:
 }
 /**
- * ecryptfs_find_auth_tok_for_sig
- * @auth_tok: Set to the matching auth_tok; NULL if not found
- * @crypt_stat: inode crypt_stat crypto context
- * @sig: Sig of auth_tok to find
- *
- * For now, this function simply looks at the registered auth_tok's
- * linked off the mount_crypt_stat, so all the auth_toks that can be
- * used must be registered at mount time. This function could
- * potentially try a lot harder to find auth_tok's (e.g., by calling
- * out to ecryptfsd to dynamically retrieve an auth_tok object) so
- * that static registration of auth_tok's will no longer be necessary.
- *
- * Returns zero on no error; non-zero on error
- */
-static int
-ecryptfs_find_auth_tok_for_sig(
-        struct ecryptfs_auth_tok **auth_tok,
-        struct ecryptfs_crypt_stat *crypt_stat, char *sig)
-{
-        struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
-                crypt_stat->mount_crypt_stat;
-        struct ecryptfs_global_auth_tok *global_auth_tok;
-        int rc = 0;
-        (*auth_tok) = NULL;
-        if (ecryptfs_find_global_auth_tok_for_sig(&global_auth_tok,
-                                                  mount_crypt_stat, sig)) {
-                struct key *auth_tok_key;
-                rc = ecryptfs_keyring_auth_tok_for_sig(&auth_tok_key, auth_tok,
-                                                       sig);
-        } else
-                (*auth_tok) = global_auth_tok->global_auth_tok;
-        return rc;
-}
-/**
 * decrypt_passphrase_encrypted_session_key - Decrypt the session key with the given auth_tok.
 * @auth_tok: The passphrase authentication token to use to encrypt the FEK
 * @crypt_stat: The cryptographic context
@@ -1256,7 +1769,8 @@ find_next_matching_auth_tok:
                        rc = -EINVAL;
                        goto out_wipe_list;
                }
-                ecryptfs_find_auth_tok_for_sig(&matching_auth_tok, crypt_stat,
+                ecryptfs_find_auth_tok_for_sig(&matching_auth_tok,
+                                               crypt_stat->mount_crypt_stat,
                                               candidate_auth_tok_sig);
                if (matching_auth_tok) {
                        found_auth_tok = 1;
@@ -1336,7 +1850,9 @@ pki_encrypt_session_key(struct ecryptfs_auth_tok *auth_tok,
        int rc;
        rc = write_tag_66_packet(auth_tok->token.private_key.signature,
-                                 ecryptfs_code_for_cipher_string(crypt_stat),
+                                 ecryptfs_code_for_cipher_string(
+                                         crypt_stat->cipher,
+                                         crypt_stat->key_size),
                                 crypt_stat, &payload, &payload_len);
        if (rc) {
                ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet\n");
@@ -1696,7 +2212,8 @@ encrypted_session_key_set:
        dest[(*packet_size)++] = 0x04; /* version 4 */
        /* TODO: Break from RFC2440 so that arbitrary ciphers can be
         * specified with strings */
-        cipher_code = ecryptfs_code_for_cipher_string(crypt_stat);
+        cipher_code = ecryptfs_code_for_cipher_string(crypt_stat->cipher,
+                                                      crypt_stat->key_size);
        if (cipher_code == 0) {
                ecryptfs_printk(KERN_WARNING, "Unable to generate code for "
                                "cipher [%s]\n", crypt_stat->cipher);
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index fd630713c5c7..789cf2e1be1e 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -206,7 +206,9 @@ enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig,
       ecryptfs_opt_cipher, ecryptfs_opt_ecryptfs_cipher,
       ecryptfs_opt_ecryptfs_key_bytes,
       ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata,
-       ecryptfs_opt_encrypted_view, ecryptfs_opt_err };
+       ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig,
+       ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes,
+       ecryptfs_opt_err };
 static const match_table_t tokens = {
        {ecryptfs_opt_sig, "sig=%s"},
@@ -217,6 +219,9 @@ static const match_table_t tokens = {
        {ecryptfs_opt_passthrough, "ecryptfs_passthrough"},
        {ecryptfs_opt_xattr_metadata, "ecryptfs_xattr_metadata"},
        {ecryptfs_opt_encrypted_view, "ecryptfs_encrypted_view"},
+        {ecryptfs_opt_fnek_sig, "ecryptfs_fnek_sig=%s"},
+        {ecryptfs_opt_fn_cipher, "ecryptfs_fn_cipher=%s"},
+        {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"},
        {ecryptfs_opt_err, NULL}
 };
@@ -281,8 +286,11 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
        int rc = 0;
        int sig_set = 0;
        int cipher_name_set = 0;
+        int fn_cipher_name_set = 0;
        int cipher_key_bytes;
        int cipher_key_bytes_set = 0;
+        int fn_cipher_key_bytes;
+        int fn_cipher_key_bytes_set = 0;
        struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
                &ecryptfs_superblock_to_private(sb)->mount_crypt_stat;
        substring_t args[MAX_OPT_ARGS];
@@ -290,7 +298,12 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
        char *sig_src;
        char *cipher_name_dst;
        char *cipher_name_src;
+        char *fn_cipher_name_dst;
+        char *fn_cipher_name_src;
+        char *fnek_dst;
+        char *fnek_src;
        char *cipher_key_bytes_src;
+        char *fn_cipher_key_bytes_src;
        if (!options) {
                rc = -EINVAL;
@@ -322,10 +335,7 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
                                global_default_cipher_name;
                        strncpy(cipher_name_dst, cipher_name_src,
                                ECRYPTFS_MAX_CIPHER_NAME_SIZE);
-                        ecryptfs_printk(KERN_DEBUG,
+                        cipher_name_dst[ECRYPTFS_MAX_CIPHER_NAME_SIZE] = '\0';
-                                        "The mount_crypt_stat "
-                                        "global_default_cipher_name set to: "
-                                        "[%s]\n", cipher_name_dst);
                        cipher_name_set = 1;
                        break;
                case ecryptfs_opt_ecryptfs_key_bytes:
@@ -335,11 +345,6 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
                                                   &cipher_key_bytes_src, 0);
                        mount_crypt_stat->global_default_cipher_key_size =
                                cipher_key_bytes;
-                        ecryptfs_printk(KERN_DEBUG,
-                                        "The mount_crypt_stat "
-                                        "global_default_cipher_key_size "
-                                        "set to: [%d]\n", mount_crypt_stat->
-                                        global_default_cipher_key_size);
                        cipher_key_bytes_set = 1;
                        break;
                case ecryptfs_opt_passthrough:
@@ -356,11 +361,51 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
                        mount_crypt_stat->flags |=
                                ECRYPTFS_ENCRYPTED_VIEW_ENABLED;
                        break;
+                case ecryptfs_opt_fnek_sig:
+                        fnek_src = args[0].from;
+                        fnek_dst =
+                                mount_crypt_stat->global_default_fnek_sig;
+                        strncpy(fnek_dst, fnek_src, ECRYPTFS_SIG_SIZE_HEX);
+                        mount_crypt_stat->global_default_fnek_sig[
+                                ECRYPTFS_SIG_SIZE_HEX] = '\0';
+                        rc = ecryptfs_add_global_auth_tok(
+                                mount_crypt_stat,
+                                mount_crypt_stat->global_default_fnek_sig);
+                        if (rc) {
+                                printk(KERN_ERR "Error attempting to register "
+                                       "global fnek sig [%s]; rc = [%d]\n",
+                                       mount_crypt_stat->global_default_fnek_sig,
+                                       rc);
+                                goto out;
+                        }
+                        mount_crypt_stat->flags |=
+                                (ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES
+                                 | ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK);
+                        break;
+                case ecryptfs_opt_fn_cipher:
+                        fn_cipher_name_src = args[0].from;
+                        fn_cipher_name_dst =
+                                mount_crypt_stat->global_default_fn_cipher_name;
+                        strncpy(fn_cipher_name_dst, fn_cipher_name_src,
+                                ECRYPTFS_MAX_CIPHER_NAME_SIZE);
+                        mount_crypt_stat->global_default_fn_cipher_name[
+                                ECRYPTFS_MAX_CIPHER_NAME_SIZE] = '\0';
+                        fn_cipher_name_set = 1;
+                        break;
+                case ecryptfs_opt_fn_cipher_key_bytes:
+                        fn_cipher_key_bytes_src = args[0].from;
+                        fn_cipher_key_bytes =
+                                (int)simple_strtol(fn_cipher_key_bytes_src,
+                                                   &fn_cipher_key_bytes_src, 0);
+                        mount_crypt_stat->global_default_fn_cipher_key_bytes =
+                                fn_cipher_key_bytes;
+                        fn_cipher_key_bytes_set = 1;
+                        break;
                case ecryptfs_opt_err:
                default:
-                        ecryptfs_printk(KERN_WARNING,
+                        printk(KERN_WARNING
-                                        "eCryptfs: unrecognized option '%s'\n",
+                               "%s: eCryptfs: unrecognized option [%s]\n",
-                                        p);
+                               __func__, p);
                }
        }
        if (!sig_set) {
@@ -374,33 +419,60 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
                int cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER);
                BUG_ON(cipher_name_len >= ECRYPTFS_MAX_CIPHER_NAME_SIZE);
                strcpy(mount_crypt_stat->global_default_cipher_name,
                       ECRYPTFS_DEFAULT_CIPHER);
        }
-        if (!cipher_key_bytes_set) {
+        if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)
+            && !fn_cipher_name_set)
+                strcpy(mount_crypt_stat->global_default_fn_cipher_name,
+                       mount_crypt_stat->global_default_cipher_name);
+        if (!cipher_key_bytes_set)
                mount_crypt_stat->global_default_cipher_key_size = 0;
-        }
+        if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)
+            && !fn_cipher_key_bytes_set)
+                mount_crypt_stat->global_default_fn_cipher_key_bytes =
+                        mount_crypt_stat->global_default_cipher_key_size;
        mutex_lock(&key_tfm_list_mutex);
        if (!ecryptfs_tfm_exists(mount_crypt_stat->global_default_cipher_name,
-                                 NULL))
+                                 NULL)) {
                rc = ecryptfs_add_new_key_tfm(
                        NULL, mount_crypt_stat->global_default_cipher_name,
                        mount_crypt_stat->global_default_cipher_key_size);
-        mutex_unlock(&key_tfm_list_mutex);
+                if (rc) {
-        if (rc) {
+                        printk(KERN_ERR "Error attempting to initialize "
-                printk(KERN_ERR "Error attempting to initialize cipher with "
+                               "cipher with name = [%s] and key size = [%td]; "
-                       "name = [%s] and key size = [%td]; rc = [%d]\n",
+                               "rc = [%d]\n",
-                       mount_crypt_stat->global_default_cipher_name,
+                               mount_crypt_stat->global_default_cipher_name,
-                       mount_crypt_stat->global_default_cipher_key_size, rc);
+                               mount_crypt_stat->global_default_cipher_key_size,
-                rc = -EINVAL;
+                               rc);
-                goto out;
+                        rc = -EINVAL;
+                        mutex_unlock(&key_tfm_list_mutex);
+                        goto out;
+                }
        }
+        if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)
+            && !ecryptfs_tfm_exists(
+                    mount_crypt_stat->global_default_fn_cipher_name, NULL)) {
+                rc = ecryptfs_add_new_key_tfm(
+                        NULL, mount_crypt_stat->global_default_fn_cipher_name,
+                        mount_crypt_stat->global_default_fn_cipher_key_bytes);
+                if (rc) {
+                        printk(KERN_ERR "Error attempting to initialize "
+                               "cipher with name = [%s] and key size = [%td]; "
+                               "rc = [%d]\n",
+                               mount_crypt_stat->global_default_fn_cipher_name,
+                               mount_crypt_stat->global_default_fn_cipher_key_bytes,
+                               rc);
+                        rc = -EINVAL;
+                        mutex_unlock(&key_tfm_list_mutex);
+                        goto out;
+                }
+        }
+        mutex_unlock(&key_tfm_list_mutex);
        rc = ecryptfs_init_global_auth_toks(mount_crypt_stat);
-        if (rc) {
+        if (rc)
                printk(KERN_WARNING "One or more global auth toks could not "
                       "properly register; rc = [%d]\n", rc);
-        }
 out:
        return rc;
 }
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index 6913f727624d..96ef51489e01 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -193,7 +193,7 @@ ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, uid_t euid,
        (*daemon) = kzalloc(sizeof(**daemon), GFP_KERNEL);
        if (!(*daemon)) {
                rc = -ENOMEM;
-                printk(KERN_ERR "%s: Failed to allocate [%Zd] bytes of "
+                printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of "
                       "GFP_KERNEL memory\n", __func__, sizeof(**daemon));
                goto out;
        }
@@ -435,7 +435,7 @@ int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid,
        msg_ctx->msg = kmalloc(msg_size, GFP_KERNEL);
        if (!msg_ctx->msg) {
                rc = -ENOMEM;
-                printk(KERN_ERR "%s: Failed to allocate [%Zd] bytes of "
+                printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of "
                       "GFP_KERNEL memory\n", __func__, msg_size);
                goto unlock;
        }
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index efd95a0ed1ea..a67fea655f49 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -199,7 +199,7 @@ int ecryptfs_send_miscdev(char *data, size_t data_size,
                if (!msg_ctx->msg) {
                        rc = -ENOMEM;
                        printk(KERN_ERR "%s: Out of memory whilst attempting "
-                               "to kmalloc(%Zd, GFP_KERNEL)\n", __func__,
+                               "to kmalloc(%zd, GFP_KERNEL)\n", __func__,
                               (sizeof(*msg_ctx->msg) + data_size));
                        goto out_unlock;
                }
@@ -322,7 +322,7 @@ check_list:
        if (count < total_length) {
                rc = 0;
                printk(KERN_WARNING "%s: Only given user buffer of "
-                       "size [%Zd], but we need [%Zd] to read the "
+                       "size [%zd], but we need [%zd] to read the "
                       "pending message\n", __func__, count, total_length);
                goto out_unlock_msg_ctx;
        }
@@ -376,7 +376,7 @@ static int ecryptfs_miscdev_response(char *data, size_t data_size,
        if ((sizeof(*msg) + msg->data_len) != data_size) {
                printk(KERN_WARNING "%s: (sizeof(*msg) + msg->data_len) = "
-                       "[%Zd]; data_size = [%Zd]. Invalid packet.\n", __func__,
+                       "[%zd]; data_size = [%zd]. Invalid packet.\n", __func__,
                       (sizeof(*msg) + msg->data_len), data_size);
                rc = -EINVAL;
                goto out;
@@ -421,7 +421,7 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
        data = kmalloc(count, GFP_KERNEL);
        if (!data) {
                printk(KERN_ERR "%s: Out of memory whilst attempting to "
-                       "kmalloc([%Zd], GFP_KERNEL)\n", __func__, count);
+                       "kmalloc([%zd], GFP_KERNEL)\n", __func__, count);
                goto out;
        }
        rc = copy_from_user(data, buf, count);
@@ -436,8 +436,8 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
        case ECRYPTFS_MSG_RESPONSE:
                if (count < (1 + 4 + 1 + sizeof(struct ecryptfs_message))) {
                        printk(KERN_WARNING "%s: Minimum acceptable packet "
-                               "size is [%Zd], but amount of data written is "
+                               "size is [%zd], but amount of data written is "
-                               "only [%Zd]. Discarding response packet.\n",
+                               "only [%zd]. Discarding response packet.\n",
                               __func__,
                               (1 + 4 + 1 + sizeof(struct ecryptfs_message)),
                               count);
@@ -455,9 +455,9 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
                }
                i += packet_size_length;
                if ((1 + 4 + packet_size_length + packet_size) != count) {
-                        printk(KERN_WARNING "%s: (1 + packet_size_length([%Zd])"
+                        printk(KERN_WARNING "%s: (1 + packet_size_length([%zd])"
-                               " + packet_size([%Zd]))([%Zd]) != "
+                               " + packet_size([%zd]))([%zd]) != "
-                               "count([%Zd]). Invalid packet format.\n",
+                               "count([%zd]). Invalid packet format.\n",
                               __func__, packet_size_length, packet_size,
                               (1 + packet_size_length + packet_size), count);
                        goto out_free;
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 04d7b3fa1ac6..46cec2b69796 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -288,7 +288,7 @@ static int ecryptfs_write_begin(struct file *file,
        loff_t prev_page_end_size;
        int rc = 0;
-        page = __grab_cache_page(mapping, index);
+        page = grab_cache_page_write_begin(mapping, index, flags);
        if (!page)
                return -ENOMEM;
        *pagep = page;
diff --git a/fs/efs/Kconfig b/fs/efs/Kconfig
new file mode 100644
index 000000000000..6ebfc1c207a8
--- /dev/null
+++ b/fs/efs/Kconfig
@@ -0,0 +1,14 @@
+config EFS_FS
+        tristate "EFS file system support (read only) (EXPERIMENTAL)"
+        depends on BLOCK && EXPERIMENTAL
+        help
+          EFS is an older file system used for non-ISO9660 CD-ROMs and hard
+          disk partitions by SGI's IRIX operating system (IRIX 6.0 and newer
+          uses the XFS file system for hard disk partitions however).
+          This implementation only offers read-only access. If you don't know
+          what all this is about, it's safe to say N. For more information
+          about EFS see its home page at <http://aeschi.ch.eu.org/efs/>.
+          To compile the EFS file system support as a module, choose M here: the
+          module will be called efs.
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 08bf558d0408..5de2c2db3aa2 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -198,7 +198,7 @@ struct file *eventfd_fget(int fd)
        return file;
 }
-asmlinkage long sys_eventfd2(unsigned int count, int flags)
+SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
 {
        int fd;
        struct eventfd_ctx *ctx;
@@ -228,8 +228,7 @@ asmlinkage long sys_eventfd2(unsigned int count, int flags)
        return fd;
 }
-asmlinkage long sys_eventfd(unsigned int count)
+SYSCALL_DEFINE1(eventfd, unsigned int, count)
 {
        return sys_eventfd2(count, 0);
 }
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 96355d505347..011b9b8c90c6 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -234,8 +234,6 @@ struct ep_pqueue {
 /*
 * Configuration options available inside /proc/sys/fs/epoll/
 */
-/* Maximum number of epoll devices, per user */
-static int max_user_instances __read_mostly;
 /* Maximum number of epoll watched descriptors, per user */
 static int max_user_watches __read_mostly;
@@ -261,14 +259,6 @@ static int zero;
 ctl_table epoll_table[] = {
        {
-                .procname       = "max_user_instances",
-                .data           = &max_user_instances,
-                .maxlen         = sizeof(int),
-                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
-                .extra1         = &zero,
-        },
-        {
                .procname       = "max_user_watches",
                .data           = &max_user_watches,
                .maxlen         = sizeof(int),
@@ -491,7 +481,6 @@ static void ep_free(struct eventpoll *ep)
        mutex_unlock(&epmutex);
        mutex_destroy(&ep->mtx);
-        atomic_dec(&ep->user->epoll_devs);
        free_uid(ep->user);
        kfree(ep);
 }
@@ -581,10 +570,6 @@ static int ep_alloc(struct eventpoll **pep)
        struct eventpoll *ep;
        user = get_current_user();
-        error = -EMFILE;
-        if (unlikely(atomic_read(&user->epoll_devs) >=
-                        max_user_instances))
-                goto free_uid;
        error = -ENOMEM;
        ep = kzalloc(sizeof(*ep), GFP_KERNEL);
        if (unlikely(!ep))
@@ -1110,7 +1095,7 @@ retry:
 /*
 * Open an eventpoll file descriptor.
 */
-asmlinkage long sys_epoll_create1(int flags)
+SYSCALL_DEFINE1(epoll_create1, int, flags)
 {
        int error, fd = -1;
        struct eventpoll *ep;
@@ -1141,7 +1126,6 @@ asmlinkage long sys_epoll_create1(int flags)
                              flags & O_CLOEXEC);
        if (fd < 0)
                ep_free(ep);
-        atomic_inc(&ep->user->epoll_devs);
 error_return:
        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
@@ -1150,7 +1134,7 @@ error_return:
        return fd;
 }
-asmlinkage long sys_epoll_create(int size)
+SYSCALL_DEFINE1(epoll_create, int, size)
 {
        if (size < 0)
                return -EINVAL;
@@ -1163,8 +1147,8 @@ asmlinkage long sys_epoll_create(int size)
 * the eventpoll file that enables the insertion/removal/change of
 * file descriptors inside the interest set.
 */
-asmlinkage long sys_epoll_ctl(int epfd, int op, int fd,
+SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
-                              struct epoll_event __user *event)
+                struct epoll_event __user *, event)
 {
        int error;
        struct file *file, *tfile;
@@ -1261,8 +1245,8 @@ error_return:
 * Implement the event wait interface for the eventpoll file. It is the kernel
 * part of the user space epoll_wait(2).
 */
-asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events,
+SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
-                               int maxevents, int timeout)
+                int, maxevents, int, timeout)
 {
        int error;
        struct file *file;
@@ -1319,9 +1303,9 @@ error_return:
 * Implement the event wait interface for the eventpoll file. It is the kernel
 * part of the user space epoll_pwait(2).
 */
-asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events,
+SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
-                int maxevents, int timeout, const sigset_t __user *sigmask,
+                int, maxevents, int, timeout, const sigset_t __user *, sigmask,
-                size_t sigsetsize)
+                size_t, sigsetsize)
 {
        int error;
        sigset_t ksigmask, sigsaved;
@@ -1366,8 +1350,10 @@ static int __init eventpoll_init(void)
        struct sysinfo si;
        si_meminfo(&si);
-        max_user_instances = 128;
+        /*
-        max_user_watches = (((si.totalram - si.totalhigh) / 32) << PAGE_SHIFT) /
+         * Allows top 4% of lomem to be allocated for epoll watches (per user).
+         */
+        max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
                EP_ITEM_COST;
        /* Initialize the structure used to perform safe poll wait head wake ups */
diff --git a/fs/exec.c b/fs/exec.c
index 9c789a525cc4..febfd8ed6ad1 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -52,17 +52,13 @@
 #include <linux/audit.h>
 #include <linux/tracehook.h>
 #include <linux/kmod.h>
+#include <linux/fsnotify.h>
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
 #include <asm/tlb.h>
 #include "internal.h"
-#ifdef __alpha__
-/* for /sbin/loader handling in search_binary_handler() */
-#include <linux/a.out.h>
-#endif
 int core_uses_pid;
 char core_pattern[CORENAME_MAX_SIZE] = "core";
 int suid_dumpable = 0;
@@ -104,7 +100,7 @@ static inline void put_binfmt(struct linux_binfmt * fmt)
 *
 * Also note that we take the address to load from from the file itself.
 */
-asmlinkage long sys_uselib(const char __user * library)
+SYSCALL_DEFINE1(uselib, const char __user *, library)
 {
        struct file *file;
        struct nameidata nd;
@@ -128,7 +124,8 @@ asmlinkage long sys_uselib(const char __user * library)
        if (nd.path.mnt->mnt_flags & MNT_NOEXEC)
                goto exit;
-        error = vfs_permission(&nd, MAY_READ | MAY_EXEC | MAY_OPEN);
+        error = inode_permission(nd.path.dentry->d_inode,
+                                 MAY_READ | MAY_EXEC | MAY_OPEN);
        if (error)
                goto exit;
        error = ima_path_check(&nd.path, MAY_READ | MAY_EXEC | MAY_OPEN);
@@ -140,6 +137,8 @@ asmlinkage long sys_uselib(const char __user * library)
        if (IS_ERR(file))
                goto out;
+        fsnotify_open(file->f_path.dentry);
        error = -ENOEXEC;
        if(file->f_op) {
                struct linux_binfmt * fmt;
@@ -237,13 +236,13 @@ static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
 static int __bprm_mm_init(struct linux_binprm *bprm)
 {
-        int err = -ENOMEM;
+        int err;
        struct vm_area_struct *vma = NULL;
        struct mm_struct *mm = bprm->mm;
        bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
        if (!vma)
-                goto err;
+                return -ENOMEM;
        down_write(&mm->mmap_sem);
        vma->vm_mm = mm;
@@ -256,28 +255,20 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
         */
        vma->vm_end = STACK_TOP_MAX;
        vma->vm_start = vma->vm_end - PAGE_SIZE;
        vma->vm_flags = VM_STACK_FLAGS;
        vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
        err = insert_vm_struct(mm, vma);
-        if (err) {
+        if (err)
-                up_write(&mm->mmap_sem);
                goto err;
-        }
        mm->stack_vm = mm->total_vm = 1;
        up_write(&mm->mmap_sem);
        bprm->p = vma->vm_end - sizeof(void *);
        return 0;
 err:
-        if (vma) {
+        up_write(&mm->mmap_sem);
-                bprm->vma = NULL;
+        bprm->vma = NULL;
-                kmem_cache_free(vm_area_cachep, vma);
+        kmem_cache_free(vm_area_cachep, vma);
-        }
        return err;
 }
@@ -684,7 +675,7 @@ struct file *open_exec(const char *name)
        if (nd.path.mnt->mnt_flags & MNT_NOEXEC)
                goto out_path_put;
-        err = vfs_permission(&nd, MAY_EXEC | MAY_OPEN);
+        err = inode_permission(nd.path.dentry->d_inode, MAY_EXEC | MAY_OPEN);
        if (err)
                goto out_path_put;
        err = ima_path_check(&nd.path, MAY_EXEC | MAY_OPEN);
@@ -695,6 +686,8 @@ struct file *open_exec(const char *name)
        if (IS_ERR(file))
                return file;
+        fsnotify_open(file->f_path.dentry);
        err = deny_write_access(file);
        if (err) {
                fput(file);
@@ -1178,41 +1171,7 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
        unsigned int depth = bprm->recursion_depth;
        int try,retval;
        struct linux_binfmt *fmt;
-#ifdef __alpha__
-        /* handle /sbin/loader.. */
-        {
-            struct exec * eh = (struct exec *) bprm->buf;
-            if (!bprm->loader && eh->fh.f_magic == 0x183 &&
-                (eh->fh.f_flags & 0x3000) == 0x3000)
-            {
-                struct file * file;
-                unsigned long loader;
-                allow_write_access(bprm->file);
-                fput(bprm->file);
-                bprm->file = NULL;
-                loader = bprm->vma->vm_end - sizeof(void *);
-                file = open_exec("/sbin/loader");
-                retval = PTR_ERR(file);
-                if (IS_ERR(file))
-                        return retval;
-                /* Remember if the application is TASO.  */
-                bprm->taso = eh->ah.entry < 0x100000000UL;
-                bprm->file = file;
-                bprm->loader = loader;
-                retval = prepare_binprm(bprm);
-                if (retval<0)
-                        return retval;
-                /* should call search_binary_handler recursively here,
-                   but it does not matter */
-            }
-        }
-#endif
        retval = security_bprm_check(bprm);
        if (retval)
                return retval;
@@ -1737,7 +1696,7 @@ int get_dumpable(struct mm_struct *mm)
        return (ret >= 2) ? 2 : ret;
 }
-int do_coredump(long signr, int exit_code, struct pt_regs * regs)
+void do_coredump(long signr, int exit_code, struct pt_regs *regs)
 {
        struct core_state core_state;
        char corename[CORENAME_MAX_SIZE + 1];
@@ -1821,6 +1780,11 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
        if (ispipe) {
                helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc);
+                if (!helper_argv) {
+                        printk(KERN_WARNING "%s failed to allocate memory\n",
+                               __func__);
+                        goto fail_unlock;
+                }
                /* Terminate the string before the first option */
                delimit = strchr(corename, ' ');
                if (delimit)
@@ -1888,5 +1852,5 @@ fail_unlock:
        put_cred(cred);
        coredump_finish(mm);
 fail:
-        return retval;
+        return;
 }
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 9a0fc400f91c..2999d72153b7 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -95,10 +95,13 @@ static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len)
                mark_inode_dirty(dir);
        }
-        if (IS_DIRSYNC(dir))
+        if (IS_DIRSYNC(dir)) {
                err = write_one_page(page, 1);
-        else
+                if (!err)
+                        err = ext2_sync_inode(dir);
+        } else {
                unlock_page(page);
+        }
        return err;
 }
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 8d0add625870..66321a877e74 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -565,12 +565,8 @@ got:
        inode->i_blocks = 0;
        inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
        memset(ei->i_data, 0, sizeof(ei->i_data));
-        ei->i_flags = EXT2_I(dir)->i_flags & ~EXT2_BTREE_FL;
+        ei->i_flags =
-        if (S_ISLNK(mode))
+                ext2_mask_flags(mode, EXT2_I(dir)->i_flags & EXT2_FL_INHERITED);
-                ei->i_flags &= ~(EXT2_IMMUTABLE_FL|EXT2_APPEND_FL);
-        /* dirsync is only applied to directories */
-        if (!S_ISDIR(mode))
-                ei->i_flags &= ~EXT2_DIRSYNC_FL;
        ei->i_faddr = 0;
        ei->i_frag_no = 0;
        ei->i_frag_size = 0;
@@ -585,7 +581,10 @@ got:
        spin_lock(&sbi->s_next_gen_lock);
        inode->i_generation = sbi->s_next_generation++;
        spin_unlock(&sbi->s_next_gen_lock);
-        insert_inode_hash(inode);
+        if (insert_inode_locked(inode) < 0) {
+                err = -EINVAL;
+                goto fail_drop;
+        }
        if (DQUOT_ALLOC_INODE(inode)) {
                err = -EDQUOT;
@@ -612,6 +611,7 @@ fail_drop:
        DQUOT_DROP(inode);
        inode->i_flags |= S_NOQUOTA;
        inode->i_nlink = 0;
+        unlock_new_inode(inode);
        iput(inode);
        return ERR_PTR(err);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 7658b33e2653..23fff2f87783 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -32,6 +32,7 @@
 #include <linux/buffer_head.h>
 #include <linux/mpage.h>
 #include <linux/fiemap.h>
+#include <linux/namei.h>
 #include "ext2.h"
 #include "acl.h"
 #include "xip.h"
@@ -497,8 +498,6 @@ static int ext2_alloc_branch(struct inode *inode,
 * ext2_splice_branch - splice the allocated branch onto inode.
 * @inode: owner
 * @block: (logical) number of block we are adding
- * @chain: chain of indirect blocks (with a missing link - see
- *      ext2_alloc_branch)
 * @where: location of missing link
 * @num:   number of indirect blocks we are adding
 * @blks:  number of direct blocks we are adding
@@ -1286,9 +1285,11 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
                else
                        inode->i_mapping->a_ops = &ext2_aops;
        } else if (S_ISLNK(inode->i_mode)) {
-                if (ext2_inode_is_fast_symlink(inode))
+                if (ext2_inode_is_fast_symlink(inode)) {
                        inode->i_op = &ext2_fast_symlink_inode_operations;
-                else {
+                        nd_terminate_link(ei->i_data, inode->i_size,
+                                sizeof(ei->i_data) - 1);
+                } else {
                        inode->i_op = &ext2_symlink_inode_operations;
                        if (test_opt(inode->i_sb, NOBH))
                                inode->i_mapping->a_ops = &ext2_nobh_aops;
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index de876fa793e1..7cb4badef927 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -50,8 +50,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
                        goto setflags_out;
                }
-                if (!S_ISDIR(inode->i_mode))
+                flags = ext2_mask_flags(inode->i_mode, flags);
-                        flags &= ~EXT2_DIRSYNC_FL;
                mutex_lock(&inode->i_mutex);
                /* Is it quota file? Do not allow user to mess with it */
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 2a747252ec12..90ea17998a73 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -41,9 +41,11 @@ static inline int ext2_add_nondir(struct dentry *dentry, struct inode *inode)
        int err = ext2_add_link(dentry, inode);
        if (!err) {
                d_instantiate(dentry, inode);
+                unlock_new_inode(inode);
                return 0;
        }
        inode_dec_link_count(inode);
+        unlock_new_inode(inode);
        iput(inode);
        return err;
 }
@@ -170,6 +172,7 @@ out:
 out_fail:
        inode_dec_link_count(inode);
+        unlock_new_inode(inode);
        iput (inode);
        goto out;
 }
@@ -178,6 +181,7 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir,
        struct dentry *dentry)
 {
        struct inode *inode = old_dentry->d_inode;
+        int err;
        if (inode->i_nlink >= EXT2_LINK_MAX)
                return -EMLINK;
@@ -186,7 +190,14 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir,
        inode_inc_link_count(inode);
        atomic_inc(&inode->i_count);
-        return ext2_add_nondir(dentry, inode);
+        err = ext2_add_link(dentry, inode);
+        if (!err) {
+                d_instantiate(dentry, inode);
+                return 0;
+        }
+        inode_dec_link_count(inode);
+        iput(inode);
+        return err;
 }
 static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode)
@@ -222,12 +233,14 @@ static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode)
                goto out_fail;
        d_instantiate(dentry, inode);
+        unlock_new_inode(inode);
 out:
        return err;
 out_fail:
        inode_dec_link_count(inode);
        inode_dec_link_count(inode);
+        unlock_new_inode(inode);
        iput(inode);
 out_dir:
        inode_dec_link_count(dir);
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 647cd888ac87..da8bdeaa2e6d 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -132,6 +132,7 @@ static void ext2_put_super (struct super_block * sb)
        percpu_counter_destroy(&sbi->s_dirs_counter);
        brelse (sbi->s_sbh);
        sb->s_fs_info = NULL;
+        kfree(sbi->s_blockgroup_lock);
        kfree(sbi);
        return;
@@ -756,6 +757,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
        sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
        if (!sbi)
                return -ENOMEM;
+        sbi->s_blockgroup_lock =
+                kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
+        if (!sbi->s_blockgroup_lock) {
+                kfree(sbi);
+                return -ENOMEM;
+        }
        sb->s_fs_info = sbi;
        sbi->s_sb_block = sb_block;
@@ -983,7 +991,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
                printk ("EXT2-fs: not enough memory\n");
                goto failed_mount;
        }
-        bgl_lock_init(&sbi->s_blockgroup_lock);
+        bgl_lock_init(sbi->s_blockgroup_lock);
        sbi->s_debts = kcalloc(sbi->s_groups_count, sizeof(*sbi->s_debts), GFP_KERNEL);
        if (!sbi->s_debts) {
                printk ("EXT2-fs: not enough memory\n");
diff --git a/fs/ext3/hash.c b/fs/ext3/hash.c
index c30e149fbd2e..7d215b4d4f2e 100644
--- a/fs/ext3/hash.c
+++ b/fs/ext3/hash.c
@@ -35,23 +35,71 @@ static void TEA_transform(__u32 buf[4], __u32 const in[])
 /* The old legacy hash */
-static __u32 dx_hack_hash (const char *name, int len)
+static __u32 dx_hack_hash_unsigned(const char *name, int len)
 {
-        __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
+        __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
+        const unsigned char *ucp = (const unsigned char *) name;
+        while (len--) {
+                hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373));
+                if (hash & 0x80000000)
+                        hash -= 0x7fffffff;
+                hash1 = hash0;
+                hash0 = hash;
+        }
+        return hash0 << 1;
+}
+static __u32 dx_hack_hash_signed(const char *name, int len)
+{
+        __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
+        const signed char *scp = (const signed char *) name;
        while (len--) {
-                __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373));
+                hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373));
-                if (hash & 0x80000000) hash -= 0x7fffffff;
+                if (hash & 0x80000000)
+                        hash -= 0x7fffffff;
                hash1 = hash0;
                hash0 = hash;
        }
-        return (hash0 << 1);
+        return hash0 << 1;
 }
-static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
+static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num)
 {
        __u32   pad, val;
        int     i;
+        const signed char *scp = (const signed char *) msg;
+        pad = (__u32)len | ((__u32)len << 8);
+        pad |= pad << 16;
+        val = pad;
+        if (len > num*4)
+                len = num * 4;
+        for (i = 0; i < len; i++) {
+                if ((i % 4) == 0)
+                        val = pad;
+                val = ((int) scp[i]) + (val << 8);
+                if ((i % 4) == 3) {
+                        *buf++ = val;
+                        val = pad;
+                        num--;
+                }
+        }
+        if (--num >= 0)
+                *buf++ = val;
+        while (--num >= 0)
+                *buf++ = pad;
+}
+static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
+{
+        __u32   pad, val;
+        int     i;
+        const unsigned char *ucp = (const unsigned char *) msg;
        pad = (__u32)len | ((__u32)len << 8);
        pad |= pad << 16;
@@ -62,7 +110,7 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
        for (i=0; i < len; i++) {
                if ((i % 4) == 0)
                        val = pad;
-                val = msg[i] + (val << 8);
+                val = ((int) ucp[i]) + (val << 8);
                if ((i % 4) == 3) {
                        *buf++ = val;
                        val = pad;
@@ -95,6 +143,8 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
        const char      *p;
        int             i;
        __u32           in[8], buf[4];
+        void            (*str2hashbuf)(const char *, int, __u32 *, int) =
+                                str2hashbuf_signed;
        /* Initialize the default seed for the hash checksum functions */
        buf[0] = 0x67452301;
@@ -113,13 +163,18 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
        }
        switch (hinfo->hash_version) {
+        case DX_HASH_LEGACY_UNSIGNED:
+                hash = dx_hack_hash_unsigned(name, len);
+                break;
        case DX_HASH_LEGACY:
-                hash = dx_hack_hash(name, len);
+                hash = dx_hack_hash_signed(name, len);
                break;
+        case DX_HASH_HALF_MD4_UNSIGNED:
+                str2hashbuf = str2hashbuf_unsigned;
        case DX_HASH_HALF_MD4:
                p = name;
                while (len > 0) {
-                        str2hashbuf(p, len, in, 8);
+                        (*str2hashbuf)(p, len, in, 8);
                        half_md4_transform(buf, in);
                        len -= 32;
                        p += 32;
@@ -127,10 +182,12 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
                minor_hash = buf[2];
                hash = buf[1];
                break;
+        case DX_HASH_TEA_UNSIGNED:
+                str2hashbuf = str2hashbuf_unsigned;
        case DX_HASH_TEA:
                p = name;
                while (len > 0) {
-                        str2hashbuf(p, len, in, 4);
+                        (*str2hashbuf)(p, len, in, 4);
                        TEA_transform(buf, in);
                        len -= 16;
                        p += 16;
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 490bd0ed7896..8de6c720e510 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -559,12 +559,8 @@ got:
        ei->i_dir_start_lookup = 0;
        ei->i_disksize = 0;
-        ei->i_flags = EXT3_I(dir)->i_flags & ~EXT3_INDEX_FL;
+        ei->i_flags =
-        if (S_ISLNK(mode))
+                ext3_mask_flags(mode, EXT3_I(dir)->i_flags & EXT3_FL_INHERITED);
-                ei->i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL);
-        /* dirsync only applies to directories */
-        if (!S_ISDIR(mode))
-                ei->i_flags &= ~EXT3_DIRSYNC_FL;
 #ifdef EXT3_FRAGMENTS
        ei->i_faddr = 0;
        ei->i_frag_no = 0;
@@ -579,7 +575,10 @@ got:
        ext3_set_inode_flags(inode);
        if (IS_DIRSYNC(inode))
                handle->h_sync = 1;
-        insert_inode_hash(inode);
+        if (insert_inode_locked(inode) < 0) {
+                err = -EINVAL;
+                goto fail_drop;
+        }
        spin_lock(&sbi->s_next_gen_lock);
        inode->i_generation = sbi->s_next_generation++;
        spin_unlock(&sbi->s_next_gen_lock);
@@ -627,6 +626,7 @@ fail_drop:
        DQUOT_DROP(inode);
        inode->i_flags |= S_NOQUOTA;
        inode->i_nlink = 0;
+        unlock_new_inode(inode);
        iput(inode);
        brelse(bitmap_bh);
        return ERR_PTR(err);
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index f8424ad89971..5fa453b49a64 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -37,6 +37,7 @@
 #include <linux/uio.h>
 #include <linux/bio.h>
 #include <linux/fiemap.h>
+#include <linux/namei.h>
 #include "xattr.h"
 #include "acl.h"
@@ -1160,7 +1161,7 @@ static int ext3_write_begin(struct file *file, struct address_space *mapping,
        to = from + len;
 retry:
-        page = __grab_cache_page(mapping, index);
+        page = grab_cache_page_write_begin(mapping, index, flags);
        if (!page)
                return -ENOMEM;
        *pagep = page;
@@ -2817,9 +2818,11 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
                inode->i_op = &ext3_dir_inode_operations;
                inode->i_fop = &ext3_dir_operations;
        } else if (S_ISLNK(inode->i_mode)) {
-                if (ext3_inode_is_fast_symlink(inode))
+                if (ext3_inode_is_fast_symlink(inode)) {
                        inode->i_op = &ext3_fast_symlink_inode_operations;
-                else {
+                        nd_terminate_link(ei->i_data, inode->i_size,
+                                sizeof(ei->i_data) - 1);
+                } else {
                        inode->i_op = &ext3_symlink_inode_operations;
                        ext3_set_aops(inode);
                }
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index b7394d05ee8e..5e86ce9a86e0 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -53,8 +53,7 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
                        goto flags_out;
                }
-                if (!S_ISDIR(inode->i_mode))
+                flags = ext3_mask_flags(inode->i_mode, flags);
-                        flags &= ~EXT3_DIRSYNC_FL;
                mutex_lock(&inode->i_mutex);
                /* Is it quota file? Do not allow user to mess with it */
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 3e5edc92aa0b..4db4ffa1edad 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -74,10 +74,6 @@ static struct buffer_head *ext3_append(handle_t *handle,
 #define assert(test) J_ASSERT(test)
 #endif
-#ifndef swap
-#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0)
-#endif
 #ifdef DX_DEBUG
 #define dxtrace(command) command
 #else
@@ -368,6 +364,8 @@ dx_probe(struct qstr *entry, struct inode *dir,
                goto fail;
        }
        hinfo->hash_version = root->info.hash_version;
+        if (hinfo->hash_version <= DX_HASH_TEA)
+                hinfo->hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned;
        hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed;
        if (entry)
                ext3fs_dirhash(entry->name, entry->len, hinfo);
@@ -636,6 +634,9 @@ int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
        dir = dir_file->f_path.dentry->d_inode;
        if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) {
                hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
+                if (hinfo.hash_version <= DX_HASH_TEA)
+                        hinfo.hash_version +=
+                                EXT3_SB(dir->i_sb)->s_hash_unsigned;
                hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
                count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
                                               start_hash, start_minor_hash);
@@ -1156,9 +1157,9 @@ static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
        u32 hash2;
        struct dx_map_entry *map;
        char *data1 = (*bh)->b_data, *data2;
-        unsigned split, move, size, i;
+        unsigned split, move, size;
        struct ext3_dir_entry_2 *de = NULL, *de2;
-        int     err = 0;
+        int     err = 0, i;
        bh2 = ext3_append (handle, dir, &newblock, &err);
        if (!(bh2)) {
@@ -1357,7 +1358,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
        struct fake_dirent *fde;
        blocksize =  dir->i_sb->s_blocksize;
-        dxtrace(printk("Creating index\n"));
+        dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino));
        retval = ext3_journal_get_write_access(handle, bh);
        if (retval) {
                ext3_std_error(dir->i_sb, retval);
@@ -1366,6 +1367,19 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
        }
        root = (struct dx_root *) bh->b_data;
+        /* The 0th block becomes the root, move the dirents out */
+        fde = &root->dotdot;
+        de = (struct ext3_dir_entry_2 *)((char *)fde +
+                        ext3_rec_len_from_disk(fde->rec_len));
+        if ((char *) de >= (((char *) root) + blocksize)) {
+                ext3_error(dir->i_sb, __func__,
+                           "invalid rec_len for '..' in inode %lu",
+                           dir->i_ino);
+                brelse(bh);
+                return -EIO;
+        }
+        len = ((char *) root) + blocksize - (char *) de;
        bh2 = ext3_append (handle, dir, &block, &retval);
        if (!(bh2)) {
                brelse(bh);
@@ -1374,11 +1388,6 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
        EXT3_I(dir)->i_flags |= EXT3_INDEX_FL;
        data1 = bh2->b_data;
-        /* The 0th block becomes the root, move the dirents out */
-        fde = &root->dotdot;
-        de = (struct ext3_dir_entry_2 *)((char *)fde +
-                        ext3_rec_len_from_disk(fde->rec_len));
-        len = ((char *) root) + blocksize - (char *) de;
        memcpy (data1, de, len);
        de = (struct ext3_dir_entry_2 *) data1;
        top = data1 + len;
@@ -1398,6 +1407,8 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
        /* Initialize as for dx_probe */
        hinfo.hash_version = root->info.hash_version;
+        if (hinfo.hash_version <= DX_HASH_TEA)
+                hinfo.hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned;
        hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
        ext3fs_dirhash(name, namelen, &hinfo);
        frame = frames;
@@ -1652,9 +1663,11 @@ static int ext3_add_nondir(handle_t *handle,
        if (!err) {
                ext3_mark_inode_dirty(handle, inode);
                d_instantiate(dentry, inode);
+                unlock_new_inode(inode);
                return 0;
        }
        drop_nlink(inode);
+        unlock_new_inode(inode);
        iput(inode);
        return err;
 }
@@ -1765,6 +1778,7 @@ retry:
        dir_block = ext3_bread (handle, inode, 0, 1, &err);
        if (!dir_block) {
                drop_nlink(inode); /* is this nlink == 0? */
+                unlock_new_inode(inode);
                ext3_mark_inode_dirty(handle, inode);
                iput (inode);
                goto out_stop;
@@ -1792,6 +1806,7 @@ retry:
        err = ext3_add_entry (handle, dentry, inode);
        if (err) {
                inode->i_nlink = 0;
+                unlock_new_inode(inode);
                ext3_mark_inode_dirty(handle, inode);
                iput (inode);
                goto out_stop;
@@ -1800,6 +1815,7 @@ retry:
        ext3_update_dx_flag(dir);
        ext3_mark_inode_dirty(handle, dir);
        d_instantiate(dentry, inode);
+        unlock_new_inode(inode);
 out_stop:
        ext3_journal_stop(handle);
        if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
@@ -2170,10 +2186,10 @@ retry:
                 * We have a transaction open.  All is sweetness.  It also sets
                 * i_size in generic_commit_write().
                 */
-                err = __page_symlink(inode, symname, l,
+                err = __page_symlink(inode, symname, l, 1);
-                                mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
                if (err) {
                        drop_nlink(inode);
+                        unlock_new_inode(inode);
                        ext3_mark_inode_dirty(handle, inode);
                        iput (inode);
                        goto out_stop;
@@ -2221,7 +2237,14 @@ retry:
        inc_nlink(inode);
        atomic_inc(&inode->i_count);
-        err = ext3_add_nondir(handle, dentry, inode);
+        err = ext3_add_entry(handle, dentry, inode);
+        if (!err) {
+                ext3_mark_inode_dirty(handle, inode);
+                d_instantiate(dentry, inode);
+        } else {
+                drop_nlink(inode);
+                iput(inode);
+        }
        ext3_journal_stop(handle);
        if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
                goto retry;
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index f6c94f232ec1..b70d90e08a3c 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -48,8 +48,8 @@ static int ext3_load_journal(struct super_block *, struct ext3_super_block *,
                             unsigned long journal_devnum);
 static int ext3_create_journal(struct super_block *, struct ext3_super_block *,
                               unsigned int);
-static void ext3_commit_super (struct super_block * sb,
+static int ext3_commit_super(struct super_block *sb,
-                               struct ext3_super_block * es,
+                               struct ext3_super_block *es,
                               int sync);
 static void ext3_mark_recovery_complete(struct super_block * sb,
                                        struct ext3_super_block * es);
@@ -60,9 +60,9 @@ static const char *ext3_decode_error(struct super_block * sb, int errno,
                                     char nbuf[16]);
 static int ext3_remount (struct super_block * sb, int * flags, char * data);
 static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf);
-static void ext3_unlockfs(struct super_block *sb);
+static int ext3_unfreeze(struct super_block *sb);
 static void ext3_write_super (struct super_block * sb);
-static void ext3_write_super_lockfs(struct super_block *sb);
+static int ext3_freeze(struct super_block *sb);
 /*
 * Wrappers for journal_start/end.
@@ -439,6 +439,7 @@ static void ext3_put_super (struct super_block * sb)
                ext3_blkdev_remove(sbi);
        }
        sb->s_fs_info = NULL;
+        kfree(sbi->s_blockgroup_lock);
        kfree(sbi);
        return;
 }
@@ -682,6 +683,26 @@ static struct dentry *ext3_fh_to_parent(struct super_block *sb, struct fid *fid,
                                    ext3_nfs_get_inode);
 }
+/*
+ * Try to release metadata pages (indirect blocks, directories) which are
+ * mapped via the block device.  Since these pages could have journal heads
+ * which would prevent try_to_free_buffers() from freeing them, we must use
+ * jbd layer's try_to_free_buffers() function to release them.
+ */
+static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
+                                 gfp_t wait)
+{
+        journal_t *journal = EXT3_SB(sb)->s_journal;
+        WARN_ON(PageChecked(page));
+        if (!page_has_buffers(page))
+                return 0;
+        if (journal)
+                return journal_try_to_free_buffers(journal, page, 
+                                                   wait & ~__GFP_WAIT);
+        return try_to_free_buffers(page);
+}
 #ifdef CONFIG_QUOTA
 #define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group")
 #define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
@@ -713,7 +734,9 @@ static struct dquot_operations ext3_quota_operations = {
        .acquire_dquot  = ext3_acquire_dquot,
        .release_dquot  = ext3_release_dquot,
        .mark_dirty     = ext3_mark_dquot_dirty,
-        .write_info     = ext3_write_info
+        .write_info     = ext3_write_info,
+        .alloc_dquot    = dquot_alloc,
+        .destroy_dquot  = dquot_destroy,
 };
 static struct quotactl_ops ext3_qctl_operations = {
@@ -736,8 +759,8 @@ static const struct super_operations ext3_sops = {
        .put_super      = ext3_put_super,
        .write_super    = ext3_write_super,
        .sync_fs        = ext3_sync_fs,
-        .write_super_lockfs = ext3_write_super_lockfs,
+        .freeze_fs      = ext3_freeze,
-        .unlockfs       = ext3_unlockfs,
+        .unfreeze_fs    = ext3_unfreeze,
        .statfs         = ext3_statfs,
        .remount_fs     = ext3_remount,
        .clear_inode    = ext3_clear_inode,
@@ -746,6 +769,7 @@ static const struct super_operations ext3_sops = {
        .quota_read     = ext3_quota_read,
        .quota_write    = ext3_quota_write,
 #endif
+        .bdev_try_to_free_page = bdev_try_to_free_page,
 };
 static const struct export_operations ext3_export_ops = {
@@ -1035,8 +1059,7 @@ static int parse_options (char *options, struct super_block *sb,
                case Opt_grpjquota:
                        qtype = GRPQUOTA;
 set_qf_name:
-                        if ((sb_any_quota_enabled(sb) ||
+                        if (sb_any_quota_loaded(sb) &&
-                             sb_any_quota_suspended(sb)) &&
                            !sbi->s_qf_names[qtype]) {
                                printk(KERN_ERR
                                        "EXT3-fs: Cannot change journaled "
@@ -1075,8 +1098,7 @@ set_qf_name:
                case Opt_offgrpjquota:
                        qtype = GRPQUOTA;
 clear_qf_name:
-                        if ((sb_any_quota_enabled(sb) ||
+                        if (sb_any_quota_loaded(sb) &&
-                             sb_any_quota_suspended(sb)) &&
                            sbi->s_qf_names[qtype]) {
                                printk(KERN_ERR "EXT3-fs: Cannot change "
                                        "journaled quota options when "
@@ -1095,8 +1117,7 @@ clear_qf_name:
                case Opt_jqfmt_vfsv0:
                        qfmt = QFMT_VFS_V0;
 set_qf_format:
-                        if ((sb_any_quota_enabled(sb) ||
+                        if (sb_any_quota_loaded(sb) &&
-                             sb_any_quota_suspended(sb)) &&
                            sbi->s_jquota_fmt != qfmt) {
                                printk(KERN_ERR "EXT3-fs: Cannot change "
                                        "journaled quota options when "
@@ -1115,8 +1136,7 @@ set_qf_format:
                        set_opt(sbi->s_mount_opt, GRPQUOTA);
                        break;
                case Opt_noquota:
-                        if (sb_any_quota_enabled(sb) ||
+                        if (sb_any_quota_loaded(sb)) {
-                            sb_any_quota_suspended(sb)) {
                                printk(KERN_ERR "EXT3-fs: Cannot change quota "
                                        "options when quota turned on.\n");
                                return 0;
@@ -1548,6 +1568,13 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
        sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
        if (!sbi)
                return -ENOMEM;
+        sbi->s_blockgroup_lock =
+                kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
+        if (!sbi->s_blockgroup_lock) {
+                kfree(sbi);
+                return -ENOMEM;
+        }
        sb->s_fs_info = sbi;
        sbi->s_mount_opt = 0;
        sbi->s_resuid = EXT3_DEF_RESUID;
@@ -1744,6 +1771,18 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
        for (i=0; i < 4; i++)
                sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
        sbi->s_def_hash_version = es->s_def_hash_version;
+        i = le32_to_cpu(es->s_flags);
+        if (i & EXT2_FLAGS_UNSIGNED_HASH)
+                sbi->s_hash_unsigned = 3;
+        else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
+#ifdef __CHAR_UNSIGNED__
+                es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
+                sbi->s_hash_unsigned = 3;
+#else
+                es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
+#endif
+                sb->s_dirt = 1;
+        }
        if (sbi->s_blocks_per_group > blocksize * 8) {
                printk (KERN_ERR
@@ -1788,7 +1827,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
                goto failed_mount;
        }
-        bgl_lock_init(&sbi->s_blockgroup_lock);
+        bgl_lock_init(sbi->s_blockgroup_lock);
        for (i = 0; i < db_count; i++) {
                block = descriptor_loc(sb, logic_sb_block, i);
@@ -2272,21 +2311,23 @@ static int ext3_create_journal(struct super_block * sb,
        return 0;
 }
-static void ext3_commit_super (struct super_block * sb,
+static int ext3_commit_super(struct super_block *sb,
-                               struct ext3_super_block * es,
+                               struct ext3_super_block *es,
                               int sync)
 {
        struct buffer_head *sbh = EXT3_SB(sb)->s_sbh;
+        int error = 0;
        if (!sbh)
-                return;
+                return error;
        es->s_wtime = cpu_to_le32(get_seconds());
        es->s_free_blocks_count = cpu_to_le32(ext3_count_free_blocks(sb));
        es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb));
        BUFFER_TRACE(sbh, "marking dirty");
        mark_buffer_dirty(sbh);
        if (sync)
-                sync_dirty_buffer(sbh);
+                error = sync_dirty_buffer(sbh);
+        return error;
 }
@@ -2400,12 +2441,14 @@ static int ext3_sync_fs(struct super_block *sb, int wait)
 * LVM calls this function before a (read-only) snapshot is created.  This
 * gives us a chance to flush the journal completely and mark the fs clean.
 */
-static void ext3_write_super_lockfs(struct super_block *sb)
+static int ext3_freeze(struct super_block *sb)
 {
+        int error = 0;
+        journal_t *journal;
        sb->s_dirt = 0;
        if (!(sb->s_flags & MS_RDONLY)) {
-                journal_t *journal = EXT3_SB(sb)->s_journal;
+                journal = EXT3_SB(sb)->s_journal;
                /* Now we set up the journal barrier. */
                journal_lock_updates(journal);
@@ -2414,20 +2457,28 @@ static void ext3_write_super_lockfs(struct super_block *sb)
                 * We don't want to clear needs_recovery flag when we failed
                 * to flush the journal.
                 */
-                if (journal_flush(journal) < 0)
+                error = journal_flush(journal);
-                        return;
+                if (error < 0)
+                        goto out;
                /* Journal blocked and flushed, clear needs_recovery flag. */
                EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
-                ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1);
+                error = ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1);
+                if (error)
+                        goto out;
        }
+        return 0;
+out:
+        journal_unlock_updates(journal);
+        return error;
 }
 /*
 * Called by LVM after the snapshot is done.  We need to reset the RECOVER
 * flag here, even though the filesystem is not technically dirty yet.
 */
-static void ext3_unlockfs(struct super_block *sb)
+static int ext3_unfreeze(struct super_block *sb)
 {
        if (!(sb->s_flags & MS_RDONLY)) {
                lock_super(sb);
@@ -2437,6 +2488,7 @@ static void ext3_unlockfs(struct super_block *sb)
                unlock_super(sb);
                journal_unlock_updates(EXT3_SB(sb)->s_journal);
        }
+        return 0;
 }
 static int ext3_remount (struct super_block * sb, int * flags, char * data)
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 38b3acf5683b..9a50b8052dcf 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -20,6 +20,7 @@
 #include "ext4.h"
 #include "ext4_jbd2.h"
 #include "group.h"
+#include "mballoc.h"
 /*
 * balloc.c contains the blocks allocation and deallocation routines
@@ -100,10 +101,10 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
                 * essentially implementing a per-group read-only flag. */
                if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
                        ext4_error(sb, __func__,
-                                  "Checksum bad for group %lu\n", block_group);
+                                  "Checksum bad for group %u", block_group);
-                        gdp->bg_free_blocks_count = 0;
+                        ext4_free_blks_set(sb, gdp, 0);
-                        gdp->bg_free_inodes_count = 0;
+                        ext4_free_inodes_set(sb, gdp, 0);
-                        gdp->bg_itable_unused = 0;
+                        ext4_itable_unused_set(sb, gdp, 0);
                        memset(bh->b_data, 0xff, sb->s_blocksize);
                        return 0;
                }
@@ -205,15 +206,15 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
                                             ext4_group_t block_group,
                                             struct buffer_head **bh)
 {
-        unsigned long group_desc;
+        unsigned int group_desc;
-        unsigned long offset;
+        unsigned int offset;
        struct ext4_group_desc *desc;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        if (block_group >= sbi->s_groups_count) {
                ext4_error(sb, "ext4_get_group_desc",
                           "block_group >= groups_count - "
-                           "block_group = %lu, groups_count = %lu",
+                           "block_group = %u, groups_count = %u",
                           block_group, sbi->s_groups_count);
                return NULL;
@@ -225,7 +226,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
        if (!sbi->s_group_desc[group_desc]) {
                ext4_error(sb, "ext4_get_group_desc",
                           "Group descriptor not loaded - "
-                           "block_group = %lu, group_desc = %lu, desc = %lu",
+                           "block_group = %u, group_desc = %u, desc = %u",
                           block_group, group_desc, offset);
                return NULL;
        }
@@ -315,29 +316,50 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
        if (unlikely(!bh)) {
                ext4_error(sb, __func__,
                            "Cannot read block bitmap - "
-                            "block_group = %lu, block_bitmap = %llu",
+                            "block_group = %u, block_bitmap = %llu",
                            block_group, bitmap_blk);
                return NULL;
        }
-        if (buffer_uptodate(bh) &&
-            !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))
+        if (bitmap_uptodate(bh))
                return bh;
        lock_buffer(bh);
+        if (bitmap_uptodate(bh)) {
+                unlock_buffer(bh);
+                return bh;
+        }
        spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
        if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
                ext4_init_block_bitmap(sb, bh, block_group, desc);
+                set_bitmap_uptodate(bh);
                set_buffer_uptodate(bh);
-                unlock_buffer(bh);
                spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
+                unlock_buffer(bh);
                return bh;
        }
        spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
+        if (buffer_uptodate(bh)) {
+                /*
+                 * if the group is not uninit and bh is uptodate,
+                 * the bitmap is also uptodate
+                 */
+                set_bitmap_uptodate(bh);
+                unlock_buffer(bh);
+                return bh;
+        }
+        /*
+         * submit the buffer_head for read. We can
+         * safely mark the bitmap as uptodate now.
+         * We do it here so the bitmap uptodate bit
+         * gets set with the buffer lock held.
+         */
+        set_bitmap_uptodate(bh);
        if (bh_submit_read(bh) < 0) {
                put_bh(bh);
                ext4_error(sb, __func__,
                            "Cannot read block bitmap - "
-                            "block_group = %lu, block_bitmap = %llu",
+                            "block_group = %u, block_bitmap = %llu",
                            block_group, bitmap_blk);
                return NULL;
        }
@@ -350,62 +372,44 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 }
 /**
- * ext4_free_blocks_sb() -- Free given blocks and update quota
+ * ext4_add_groupblocks() -- Add given blocks to an existing group
 * @handle:                     handle to this transaction
 * @sb:                         super block
- * @block:                      start physcial block to free
+ * @block:                      start physical block to add to the block group
 * @count:                      number of blocks to free
- * @pdquot_freed_blocks:        pointer to quota
 *
- * XXX This function is only used by the on-line resizing code, which
+ * This marks the blocks as free in the bitmap. We ask the
- * should probably be fixed up to call the mballoc variant.  There
+ * mballoc to reload the buddy after this by setting group
- * this needs to be cleaned up later; in fact, I'm not convinced this
+ * EXT4_GROUP_INFO_NEED_INIT_BIT flag
- * is 100% correct in the face of the mballoc code.  The online resizing
- * code needs to be fixed up to more tightly (and correctly) interlock
- * with the mballoc code.
 */
-void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
+void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
-                         ext4_fsblk_t block, unsigned long count,
+                         ext4_fsblk_t block, unsigned long count)
-                         unsigned long *pdquot_freed_blocks)
 {
        struct buffer_head *bitmap_bh = NULL;
        struct buffer_head *gd_bh;
        ext4_group_t block_group;
        ext4_grpblk_t bit;
-        unsigned long i;
+        unsigned int i;
-        unsigned long overflow;
        struct ext4_group_desc *desc;
        struct ext4_super_block *es;
        struct ext4_sb_info *sbi;
-        int err = 0, ret;
+        int err = 0, ret, blk_free_count;
-        ext4_grpblk_t group_freed;
+        ext4_grpblk_t blocks_freed;
+        struct ext4_group_info *grp;
-        *pdquot_freed_blocks = 0;
        sbi = EXT4_SB(sb);
        es = sbi->s_es;
-        if (block < le32_to_cpu(es->s_first_data_block) ||
+        ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
-            block + count < block ||
-            block + count > ext4_blocks_count(es)) {
-                ext4_error(sb, "ext4_free_blocks",
-                           "Freeing blocks not in datazone - "
-                           "block = %llu, count = %lu", block, count);
-                goto error_return;
-        }
-        ext4_debug("freeing block(s) %llu-%llu\n", block, block + count - 1);
-do_more:
-        overflow = 0;
        ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
+        grp = ext4_get_group_info(sb, block_group);
        /*
         * Check to see if we are freeing blocks across a group
         * boundary.
         */
        if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
-                overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb);
+                goto error_return;
-                count -= overflow;
        }
-        brelse(bitmap_bh);
        bitmap_bh = ext4_read_block_bitmap(sb, block_group);
        if (!bitmap_bh)
                goto error_return;
@@ -418,18 +422,17 @@ do_more:
            in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
            in_range(block + count - 1, ext4_inode_table(sb, desc),
                     sbi->s_itb_per_group)) {
-                ext4_error(sb, "ext4_free_blocks",
+                ext4_error(sb, __func__,
-                           "Freeing blocks in system zones - "
+                           "Adding blocks in system zones - "
                           "Block = %llu, count = %lu",
                           block, count);
                goto error_return;
        }
        /*
-         * We are about to start releasing blocks in the bitmap,
+         * We are about to add blocks to the bitmap,
         * so we need undo access.
         */
-        /* @@@ check errors */
        BUFFER_TRACE(bitmap_bh, "getting undo access");
        err = ext4_journal_get_undo_access(handle, bitmap_bh);
        if (err)
@@ -444,107 +447,55 @@ do_more:
        err = ext4_journal_get_write_access(handle, gd_bh);
        if (err)
                goto error_return;
+        /*
-        jbd_lock_bh_state(bitmap_bh);
+         * make sure we don't allow a parallel init on other groups in the
+         * same buddy cache
-        for (i = 0, group_freed = 0; i < count; i++) {
+         */
-                /*
+        down_write(&grp->alloc_sem);
-                 * An HJ special.  This is expensive...
+        for (i = 0, blocks_freed = 0; i < count; i++) {
-                 */
-#ifdef CONFIG_JBD2_DEBUG
-                jbd_unlock_bh_state(bitmap_bh);
-                {
-                        struct buffer_head *debug_bh;
-                        debug_bh = sb_find_get_block(sb, block + i);
-                        if (debug_bh) {
-                                BUFFER_TRACE(debug_bh, "Deleted!");
-                                if (!bh2jh(bitmap_bh)->b_committed_data)
-                                        BUFFER_TRACE(debug_bh,
-                                                "No commited data in bitmap");
-                                BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap");
-                                __brelse(debug_bh);
-                        }
-                }
-                jbd_lock_bh_state(bitmap_bh);
-#endif
-                if (need_resched()) {
-                        jbd_unlock_bh_state(bitmap_bh);
-                        cond_resched();
-                        jbd_lock_bh_state(bitmap_bh);
-                }
-                /* @@@ This prevents newly-allocated data from being
-                 * freed and then reallocated within the same
-                 * transaction.
-                 *
-                 * Ideally we would want to allow that to happen, but to
-                 * do so requires making jbd2_journal_forget() capable of
-                 * revoking the queued write of a data block, which
-                 * implies blocking on the journal lock.  *forget()
-                 * cannot block due to truncate races.
-                 *
-                 * Eventually we can fix this by making jbd2_journal_forget()
-                 * return a status indicating whether or not it was able
-                 * to revoke the buffer.  On successful revoke, it is
-                 * safe not to set the allocation bit in the committed
-                 * bitmap, because we know that there is no outstanding
-                 * activity on the buffer any more and so it is safe to
-                 * reallocate it.
-                 */
-                BUFFER_TRACE(bitmap_bh, "set in b_committed_data");
-                J_ASSERT_BH(bitmap_bh,
-                                bh2jh(bitmap_bh)->b_committed_data != NULL);
-                ext4_set_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i,
-                                bh2jh(bitmap_bh)->b_committed_data);
-                /*
-                 * We clear the bit in the bitmap after setting the committed
-                 * data bit, because this is the reverse order to that which
-                 * the allocator uses.
-                 */
                BUFFER_TRACE(bitmap_bh, "clear bit");
                if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
                                                bit + i, bitmap_bh->b_data)) {
-                        jbd_unlock_bh_state(bitmap_bh);
                        ext4_error(sb, __func__,
                                   "bit already cleared for block %llu",
                                   (ext4_fsblk_t)(block + i));
-                        jbd_lock_bh_state(bitmap_bh);
                        BUFFER_TRACE(bitmap_bh, "bit already cleared");
                } else {
-                        group_freed++;
+                        blocks_freed++;
                }
        }
-        jbd_unlock_bh_state(bitmap_bh);
        spin_lock(sb_bgl_lock(sbi, block_group));
-        le16_add_cpu(&desc->bg_free_blocks_count, group_freed);
+        blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
+        ext4_free_blks_set(sb, desc, blk_free_count);
        desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
        spin_unlock(sb_bgl_lock(sbi, block_group));
-        percpu_counter_add(&sbi->s_freeblocks_counter, count);
+        percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
        if (sbi->s_log_groups_per_flex) {
                ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
                spin_lock(sb_bgl_lock(sbi, flex_group));
-                sbi->s_flex_groups[flex_group].free_blocks += count;
+                sbi->s_flex_groups[flex_group].free_blocks += blocks_freed;
                spin_unlock(sb_bgl_lock(sbi, flex_group));
        }
+        /*
+         * request to reload the buddy with the
+         * new bitmap information
+         */
+        set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
+        ext4_mb_update_group_info(grp, blocks_freed);
+        up_write(&grp->alloc_sem);
        /* We dirtied the bitmap block */
        BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
-        err = ext4_journal_dirty_metadata(handle, bitmap_bh);
+        err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
        /* And the group descriptor block */
        BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
-        ret = ext4_journal_dirty_metadata(handle, gd_bh);
+        ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
-        if (!err) err = ret;
+        if (!err)
-        *pdquot_freed_blocks += group_freed;
+                err = ret;
-        if (overflow && !err) {
-                block += count;
-                count = overflow;
-                goto do_more;
-        }
        sb->s_dirt = 1;
 error_return:
        brelse(bitmap_bh);
        ext4_std_error(sb, err);
@@ -614,7 +565,7 @@ int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
                if (dirty_blocks < 0) {
                        printk(KERN_CRIT "Dirty block accounting "
                                        "went wrong %lld\n",
-                                        dirty_blocks);
+                                        (long long)dirty_blocks);
                }
        }
        /* Check whether we have space after
@@ -666,101 +617,45 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
        return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
 }
-#define EXT4_META_BLOCK 0x1
-static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
-                                ext4_lblk_t iblock, ext4_fsblk_t goal,
-                                unsigned long *count, int *errp, int flags)
-{
-        struct ext4_allocation_request ar;
-        ext4_fsblk_t ret;
-        memset(&ar, 0, sizeof(ar));
-        /* Fill with neighbour allocated blocks */
-        ar.inode = inode;
-        ar.goal = goal;
-        ar.len = *count;
-        ar.logical = iblock;
-        if (S_ISREG(inode->i_mode) && !(flags & EXT4_META_BLOCK))
-                /* enable in-core preallocation for data block allocation */
-                ar.flags = EXT4_MB_HINT_DATA;
-        else
-                /* disable in-core preallocation for non-regular files */
-                ar.flags = 0;
-        ret = ext4_mb_new_blocks(handle, &ar, errp);
-        *count = ar.len;
-        return ret;
-}
 /*
 * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks
 *
 * @handle:             handle to this transaction
 * @inode:              file inode
 * @goal:               given target block(filesystem wide)
- * @count:              total number of blocks need
+ * @count:              pointer to total number of blocks needed
 * @errp:               error code
 *
- * Return 1st allocated block numberon success, *count stores total account
+ * Return 1st allocated block number on success, *count stores total account
 * error stores in errp pointer
 */
 ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
                ext4_fsblk_t goal, unsigned long *count, int *errp)
 {
+        struct ext4_allocation_request ar;
        ext4_fsblk_t ret;
-        ret = do_blk_alloc(handle, inode, 0, goal,
-                                count, errp, EXT4_META_BLOCK);
+        memset(&ar, 0, sizeof(ar));
+        /* Fill with neighbour allocated blocks */
+        ar.inode = inode;
+        ar.goal = goal;
+        ar.len = count ? *count : 1;
+        ret = ext4_mb_new_blocks(handle, &ar, errp);
+        if (count)
+                *count = ar.len;
        /*
         * Account for the allocated meta blocks
         */
        if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) {
                spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
-                EXT4_I(inode)->i_allocated_meta_blocks += *count;
+                EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
                spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
        }
        return ret;
 }
-/*
- * ext4_new_meta_block() -- allocate block for meta data (indexing) blocks
- *
- * @handle:             handle to this transaction
- * @inode:              file inode
- * @goal:               given target block(filesystem wide)
- * @errp:               error code
- *
- * Return allocated block number on success
- */
-ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
-                ext4_fsblk_t goal, int *errp)
-{
-        unsigned long count = 1;
-        return ext4_new_meta_blocks(handle, inode, goal, &count, errp);
-}
-/*
- * ext4_new_blocks() -- allocate data blocks
- *
- * @handle:             handle to this transaction
- * @inode:              file inode
- * @goal:               given target block(filesystem wide)
- * @count:              total number of blocks need
- * @errp:               error code
- *
- * Return 1st allocated block numberon success, *count stores total account
- * error stores in errp pointer
- */
-ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
-                                ext4_lblk_t iblock, ext4_fsblk_t goal,
-                                unsigned long *count, int *errp)
-{
-        return do_blk_alloc(handle, inode, iblock, goal, count, errp, 0);
-}
 /**
 * ext4_count_free_blocks() -- count filesystem free blocks
 * @sb:         superblock
@@ -776,7 +671,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
 #ifdef EXT4FS_DEBUG
        struct ext4_super_block *es;
        ext4_fsblk_t bitmap_count;
-        unsigned long x;
+        unsigned int x;
        struct buffer_head *bitmap_bh = NULL;
        es = EXT4_SB(sb)->s_es;
@@ -789,15 +684,15 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
                gdp = ext4_get_group_desc(sb, i, NULL);
                if (!gdp)
                        continue;
-                desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
+                desc_count += ext4_free_blks_count(sb, gdp);
                brelse(bitmap_bh);
                bitmap_bh = ext4_read_block_bitmap(sb, i);
                if (bitmap_bh == NULL)
                        continue;
                x = ext4_count_free(bitmap_bh, sb->s_blocksize);
-                printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
+                printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n",
-                        i, le16_to_cpu(gdp->bg_free_blocks_count), x);
+                        i, ext4_free_blks_count(sb, gdp), x);
                bitmap_count += x;
        }
        brelse(bitmap_bh);
@@ -812,7 +707,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
                gdp = ext4_get_group_desc(sb, i, NULL);
                if (!gdp)
                        continue;
-                desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
+                desc_count += ext4_free_blks_count(sb, gdp);
        }
        return desc_count;
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
index 0a7a6663c190..fa3af81ac565 100644
--- a/fs/ext4/bitmap.c
+++ b/fs/ext4/bitmap.c
@@ -15,10 +15,9 @@
 static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
-unsigned long ext4_count_free(struct buffer_head *map, unsigned int numchars)
+unsigned int ext4_count_free(struct buffer_head *map, unsigned int numchars)
 {
-        unsigned int i;
+        unsigned int i, sum = 0;
-        unsigned long sum = 0;
        if (!map)
                return 0;
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index fed5b610df5a..2df2e40b01af 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -64,7 +64,7 @@ static unsigned char get_dtype(struct super_block *sb, int filetype)
 int ext4_check_dir_entry(const char *function, struct inode *dir,
                         struct ext4_dir_entry_2 *de,
                         struct buffer_head *bh,
-                         unsigned long offset)
+                         unsigned int offset)
 {
        const char *error_msg = NULL;
        const int rlen = ext4_rec_len_from_disk(de->rec_len);
@@ -84,9 +84,9 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
        if (error_msg != NULL)
                ext4_error(dir->i_sb, function,
                        "bad entry in directory #%lu: %s - "
-                        "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
+                        "offset=%u, inode=%u, rec_len=%d, name_len=%d",
                        dir->i_ino, error_msg, offset,
-                        (unsigned long) le32_to_cpu(de->inode),
+                        le32_to_cpu(de->inode),
                        rlen, de->name_len);
        return error_msg == NULL ? 1 : 0;
 }
@@ -95,7 +95,7 @@ static int ext4_readdir(struct file *filp,
                         void *dirent, filldir_t filldir)
 {
        int error = 0;
-        unsigned long offset;
+        unsigned int offset;
        int i, stored;
        struct ext4_dir_entry_2 *de;
        struct super_block *sb;
@@ -405,7 +405,7 @@ static int call_filldir(struct file *filp, void *dirent,
        sb = inode->i_sb;
        if (!fname) {
-                printk(KERN_ERR "ext4: call_filldir: called with "
+                printk(KERN_ERR "EXT4-fs: call_filldir: called with "
                       "null fname?!?\n");
                return 0;
        }
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index b0537c827024..aafc9eba1c25 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -19,6 +19,7 @@
 #include <linux/types.h>
 #include <linux/blkdev.h>
 #include <linux/magic.h>
+#include <linux/jbd2.h>
 #include "ext4_i.h"
 /*
@@ -94,9 +95,9 @@ struct ext4_allocation_request {
        /* phys. block for ^^^ */
        ext4_fsblk_t pright;
        /* how many blocks we want to allocate */
-        unsigned long len;
+        unsigned int len;
        /* flags. see above EXT4_MB_HINT_* */
-        unsigned long flags;
+        unsigned int flags;
 };
 /*
@@ -156,12 +157,12 @@ struct ext4_group_desc
        __le32  bg_block_bitmap_lo;     /* Blocks bitmap block */
        __le32  bg_inode_bitmap_lo;     /* Inodes bitmap block */
        __le32  bg_inode_table_lo;      /* Inodes table block */
-        __le16  bg_free_blocks_count;   /* Free blocks count */
+        __le16  bg_free_blocks_count_lo;/* Free blocks count */
-        __le16  bg_free_inodes_count;   /* Free inodes count */
+        __le16  bg_free_inodes_count_lo;/* Free inodes count */
-        __le16  bg_used_dirs_count;     /* Directories count */
+        __le16  bg_used_dirs_count_lo;  /* Directories count */
        __le16  bg_flags;               /* EXT4_BG_flags (INODE_UNINIT, etc) */
        __u32   bg_reserved[2];         /* Likely block/inode bitmap checksum */
-        __le16  bg_itable_unused;       /* Unused inodes count */
+        __le16  bg_itable_unused_lo;    /* Unused inodes count */
        __le16  bg_checksum;            /* crc16(sb_uuid+group+desc) */
        __le32  bg_block_bitmap_hi;     /* Blocks bitmap block MSB */
        __le32  bg_inode_bitmap_hi;     /* Inodes bitmap block MSB */
@@ -169,7 +170,7 @@ struct ext4_group_desc
        __le16  bg_free_blocks_count_hi;/* Free blocks count MSB */
        __le16  bg_free_inodes_count_hi;/* Free inodes count MSB */
        __le16  bg_used_dirs_count_hi;  /* Directories count MSB */
-        __le16  bg_itable_unused_hi;    /* Unused inodes count MSB */
+        __le16  bg_itable_unused_hi;    /* Unused inodes count MSB */
        __u32   bg_reserved2[3];
 };
@@ -328,6 +329,7 @@ struct ext4_mount_options {
        uid_t s_resuid;
        gid_t s_resgid;
        unsigned long s_commit_interval;
+        u32 s_min_batch_time, s_max_batch_time;
 #ifdef CONFIG_QUOTA
        int s_jquota_fmt;
        char *s_qf_names[MAXQUOTAS];
@@ -534,7 +536,6 @@ do {									       \
 #define EXT4_MOUNT_QUOTA                0x80000 /* Some quota option set */
 #define EXT4_MOUNT_USRQUOTA             0x100000 /* "old" user quota */
 #define EXT4_MOUNT_GRPQUOTA             0x200000 /* "old" group quota */
-#define EXT4_MOUNT_EXTENTS              0x400000 /* Extents support */
 #define EXT4_MOUNT_JOURNAL_CHECKSUM     0x800000 /* Journal checksums */
 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
 #define EXT4_MOUNT_I_VERSION            0x2000000 /* i_version support */
@@ -726,11 +727,11 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 */
 #define EXT4_HAS_COMPAT_FEATURE(sb,mask)                        \
-        (EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask))
+        ((EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask)) != 0)
 #define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask)                     \
-        (EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask))
+        ((EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask)) != 0)
 #define EXT4_HAS_INCOMPAT_FEATURE(sb,mask)                      \
-        (EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask))
+        ((EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask)) != 0)
 #define EXT4_SET_COMPAT_FEATURE(sb,mask)                        \
        EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask)
 #define EXT4_SET_RO_COMPAT_FEATURE(sb,mask)                     \
@@ -806,6 +807,12 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 #define EXT4_DEFM_JMODE_WBACK   0x0060
 /*
+ * Default journal batch times
+ */
+#define EXT4_DEF_MIN_BATCH_TIME 0
+#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */
+/*
 * Structure of a directory entry
 */
 #define EXT4_NAME_LEN 255
@@ -891,6 +898,9 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len)
 #define DX_HASH_LEGACY          0
 #define DX_HASH_HALF_MD4        1
 #define DX_HASH_TEA             2
+#define DX_HASH_LEGACY_UNSIGNED 3
+#define DX_HASH_HALF_MD4_UNSIGNED       4
+#define DX_HASH_TEA_UNSIGNED            5
 #ifdef __KERNEL__
@@ -955,7 +965,7 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
 #define ERR_BAD_DX_DIR  -75000
 void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
-                        unsigned long *blockgrpp, ext4_grpblk_t *offsetp);
+                        ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp);
 extern struct proc_dir_entry *ext4_proc_root;
@@ -987,6 +997,9 @@ do {									\
 # define ATTRIB_NORET   __attribute__((noreturn))
 # define NORET_AND      noreturn,
+/* bitmap.c */
+extern unsigned int ext4_count_free(struct buffer_head *, unsigned);
 /* balloc.c */
 extern unsigned int ext4_block_group(struct super_block *sb,
                        ext4_fsblk_t blocknr);
@@ -995,20 +1008,14 @@ extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb,
 extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
 extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
                        ext4_group_t group);
-extern ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
-                        ext4_fsblk_t goal, int *errp);
 extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
                        ext4_fsblk_t goal, unsigned long *count, int *errp);
-extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
-                                        ext4_lblk_t iblock, ext4_fsblk_t goal,
-                                        unsigned long *count, int *errp);
 extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
 extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
 extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
                        ext4_fsblk_t block, unsigned long count, int metadata);
-extern void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
+extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
-                                ext4_fsblk_t block, unsigned long count,
+                                ext4_fsblk_t block, unsigned long count);
-                                unsigned long *pdquot_freed_blocks);
 extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
 extern void ext4_check_blocks_bitmap(struct super_block *);
 extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
@@ -1019,7 +1026,7 @@ extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
 /* dir.c */
 extern int ext4_check_dir_entry(const char *, struct inode *,
                                struct ext4_dir_entry_2 *,
-                                struct buffer_head *, unsigned long);
+                                struct buffer_head *, unsigned int);
 extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
                                    __u32 minor_hash,
                                    struct ext4_dir_entry_2 *dirent);
@@ -1039,7 +1046,6 @@ extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
 extern unsigned long ext4_count_free_inodes(struct super_block *);
 extern unsigned long ext4_count_dirs(struct super_block *);
 extern void ext4_check_inodes_bitmap(struct super_block *);
-extern unsigned long ext4_count_free(struct buffer_head *, unsigned);
 /* mballoc.c */
 extern long ext4_mb_stats;
@@ -1054,12 +1060,13 @@ extern int __init init_ext4_mballoc(void);
 extern void exit_ext4_mballoc(void);
 extern void ext4_mb_free_blocks(handle_t *, struct inode *,
                unsigned long, unsigned long, int, unsigned long *);
-extern int ext4_mb_add_more_groupinfo(struct super_block *sb,
+extern int ext4_mb_add_groupinfo(struct super_block *sb,
                ext4_group_t i, struct ext4_group_desc *desc);
 extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
                ext4_grpblk_t add);
+extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
+extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
+                                                ext4_group_t, int);
 /* inode.c */
 int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
                struct buffer_head *bh, ext4_fsblk_t blocknr);
@@ -1069,10 +1076,6 @@ struct buffer_head *ext4_bread(handle_t *, struct inode *,
                                                ext4_lblk_t, int, int *);
 int ext4_get_block(struct inode *inode, sector_t iblock,
                                struct buffer_head *bh_result, int create);
-int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
-                                ext4_lblk_t iblock, unsigned long maxblocks,
-                                struct buffer_head *bh_result,
-                                int create, int extend_disksize);
 extern struct inode *ext4_iget(struct super_block *, unsigned long);
 extern int  ext4_write_inode(struct inode *, int);
@@ -1123,6 +1126,9 @@ extern void ext4_abort(struct super_block *, const char *, const char *, ...)
        __attribute__ ((format (printf, 3, 4)));
 extern void ext4_warning(struct super_block *, const char *, const char *, ...)
        __attribute__ ((format (printf, 3, 4)));
+extern void ext4_grp_locked_error(struct super_block *, ext4_group_t,
+                                const char *, const char *, ...)
+        __attribute__ ((format (printf, 4, 5)));
 extern void ext4_update_dynamic_rev(struct super_block *sb);
 extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
                                        __u32 compat);
@@ -1136,12 +1142,28 @@ extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
                                      struct ext4_group_desc *bg);
 extern ext4_fsblk_t ext4_inode_table(struct super_block *sb,
                                     struct ext4_group_desc *bg);
+extern __u32 ext4_free_blks_count(struct super_block *sb,
+                                struct ext4_group_desc *bg);
+extern __u32 ext4_free_inodes_count(struct super_block *sb,
+                                 struct ext4_group_desc *bg);
+extern __u32 ext4_used_dirs_count(struct super_block *sb,
+                                struct ext4_group_desc *bg);
+extern __u32 ext4_itable_unused_count(struct super_block *sb,
+                                   struct ext4_group_desc *bg);
 extern void ext4_block_bitmap_set(struct super_block *sb,
                                  struct ext4_group_desc *bg, ext4_fsblk_t blk);
 extern void ext4_inode_bitmap_set(struct super_block *sb,
                                  struct ext4_group_desc *bg, ext4_fsblk_t blk);
 extern void ext4_inode_table_set(struct super_block *sb,
                                 struct ext4_group_desc *bg, ext4_fsblk_t blk);
+extern void ext4_free_blks_set(struct super_block *sb,
+                               struct ext4_group_desc *bg, __u32 count);
+extern void ext4_free_inodes_set(struct super_block *sb,
+                                struct ext4_group_desc *bg, __u32 count);
+extern void ext4_used_dirs_set(struct super_block *sb,
+                                struct ext4_group_desc *bg, __u32 count);
+extern void ext4_itable_unused_set(struct super_block *sb,
+                                   struct ext4_group_desc *bg, __u32 count);
 static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
 {
@@ -1184,8 +1206,11 @@ static inline void ext4_r_blocks_count_set(struct ext4_super_block *es,
 static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
 {
-        return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
+        if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
-                le32_to_cpu(raw_inode->i_size_lo);
+                return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
+                        le32_to_cpu(raw_inode->i_size_lo);
+        else
+                return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
 }
 static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
@@ -1225,11 +1250,11 @@ do {								\
 } while (0)
 #ifdef CONFIG_SMP
-/* Each CPU can accumulate FBC_BATCH blocks in their local
+/* Each CPU can accumulate percpu_counter_batch blocks in their local
 * counters. So we need to make sure we have free blocks more
- * than FBC_BATCH  * nr_cpu_ids. Also add a window of 4 times.
+ * than percpu_counter_batch  * nr_cpu_ids. Also add a window of 4 times.
 */
-#define EXT4_FREEBLOCKS_WATERMARK (4 * (FBC_BATCH * nr_cpu_ids))
+#define EXT4_FREEBLOCKS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids))
 #else
 #define EXT4_FREEBLOCKS_WATERMARK 0
 #endif
@@ -1246,6 +1271,50 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
        return ;
 }
+struct ext4_group_info {
+        unsigned long   bb_state;
+        struct rb_root  bb_free_root;
+        unsigned short  bb_first_free;
+        unsigned short  bb_free;
+        unsigned short  bb_fragments;
+        struct          list_head bb_prealloc_list;
+#ifdef DOUBLE_CHECK
+        void            *bb_bitmap;
+#endif
+        struct rw_semaphore alloc_sem;
+        unsigned short  bb_counters[];
+};
+#define EXT4_GROUP_INFO_NEED_INIT_BIT   0
+#define EXT4_GROUP_INFO_LOCKED_BIT      1
+#define EXT4_MB_GRP_NEED_INIT(grp)      \
+        (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
+static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
+{
+        struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
+        bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
+}
+static inline void ext4_unlock_group(struct super_block *sb,
+                                        ext4_group_t group)
+{
+        struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
+        bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
+}
+static inline int ext4_is_group_locked(struct super_block *sb,
+                                        ext4_group_t group)
+{
+        struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
+        return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT,
+                                                &(grinfo->bb_state));
+}
 /*
 * Inodes and files operations
 */
@@ -1271,18 +1340,38 @@ extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
 extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
                                       int chunk);
 extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
-                        ext4_lblk_t iblock,
+                               ext4_lblk_t iblock, unsigned int max_blocks,
-                        unsigned long max_blocks, struct buffer_head *bh_result,
+                               struct buffer_head *bh_result,
-                        int create, int extend_disksize);
+                               int create, int extend_disksize);
 extern void ext4_ext_truncate(struct inode *);
 extern void ext4_ext_init(struct super_block *);
 extern void ext4_ext_release(struct super_block *);
 extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
                          loff_t len);
 extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode,
-                        sector_t block, unsigned long max_blocks,
+                        sector_t block, unsigned int max_blocks,
                        struct buffer_head *bh, int create,
                        int extend_disksize, int flag);
+extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+                        __u64 start, __u64 len);
+/*
+ * Add new method to test whether block and inode bitmaps are properly
+ * initialized. With uninit_bg reading the block from disk is not enough
+ * to mark the bitmap uptodate. We need to also zero-out the bitmap
+ */
+#define BH_BITMAP_UPTODATE BH_JBDPrivateStart
+static inline int bitmap_uptodate(struct buffer_head *bh)
+{
+        return (buffer_uptodate(bh) &&
+                        test_bit(BH_BITMAP_UPTODATE, &(bh)->b_state));
+}
+static inline void set_bitmap_uptodate(struct buffer_head *bh)
+{
+        set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
+}
 #endif  /* __KERNEL__ */
 #endif  /* _EXT4_H */
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index bec7ce59fc0d..18cb67b2cbbc 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -194,11 +194,6 @@ static inline unsigned short ext_depth(struct inode *inode)
        return le16_to_cpu(ext_inode_hdr(inode)->eh_depth);
 }
-static inline void ext4_ext_tree_changed(struct inode *inode)
-{
-        EXT4_I(inode)->i_ext_generation++;
-}
 static inline void
 ext4_ext_invalidate_cache(struct inode *inode)
 {
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index 5c124c0ac6d3..e69acc16f5c4 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -31,7 +31,7 @@ typedef unsigned long long ext4_fsblk_t;
 typedef __u32 ext4_lblk_t;
 /* data type for block group number */
-typedef unsigned long ext4_group_t;
+typedef unsigned int ext4_group_t;
 #define rsv_start rsv_window._rsv_start
 #define rsv_end rsv_window._rsv_end
@@ -100,9 +100,6 @@ struct ext4_inode_info {
         */
        loff_t  i_disksize;
-        /* on-disk additional length */
-        __u16 i_extra_isize;
        /*
         * i_data_sem is for serialising ext4_truncate() against
         * ext4_getblock().  In the 2.4 ext2 design, great chunks of inode's
@@ -117,7 +114,6 @@ struct ext4_inode_info {
        struct inode vfs_inode;
        struct jbd2_inode jinode;
-        unsigned long i_ext_generation;
        struct ext4_ext_cache i_cached_extent;
        /*
         * File creation time. Its function is same as that of
@@ -130,10 +126,14 @@ struct ext4_inode_info {
        spinlock_t i_prealloc_lock;
        /* allocation reservation info for delalloc */
-        unsigned long i_reserved_data_blocks;
+        unsigned int i_reserved_data_blocks;
-        unsigned long i_reserved_meta_blocks;
+        unsigned int i_reserved_meta_blocks;
-        unsigned long i_allocated_meta_blocks;
+        unsigned int i_allocated_meta_blocks;
        unsigned short i_delalloc_reserved_flag;
+        /* on-disk additional length */
+        __u16 i_extra_isize;
        spinlock_t i_block_reservation_lock;
 };
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index c75384b34f2c..ad13a84644e1 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -7,53 +7,96 @@
 int __ext4_journal_get_undo_access(const char *where, handle_t *handle,
                                struct buffer_head *bh)
 {
-        int err = jbd2_journal_get_undo_access(handle, bh);
+        int err = 0;
-        if (err)
-                ext4_journal_abort_handle(where, __func__, bh, handle, err);
+        if (ext4_handle_valid(handle)) {
+                err = jbd2_journal_get_undo_access(handle, bh);
+                if (err)
+                        ext4_journal_abort_handle(where, __func__, bh,
+                                                  handle, err);
+        }
        return err;
 }
 int __ext4_journal_get_write_access(const char *where, handle_t *handle,
                                struct buffer_head *bh)
 {
-        int err = jbd2_journal_get_write_access(handle, bh);
+        int err = 0;
-        if (err)
-                ext4_journal_abort_handle(where, __func__, bh, handle, err);
+        if (ext4_handle_valid(handle)) {
+                err = jbd2_journal_get_write_access(handle, bh);
+                if (err)
+                        ext4_journal_abort_handle(where, __func__, bh,
+                                                  handle, err);
+        }
        return err;
 }
 int __ext4_journal_forget(const char *where, handle_t *handle,
                                struct buffer_head *bh)
 {
-        int err = jbd2_journal_forget(handle, bh);
+        int err = 0;
-        if (err)
-                ext4_journal_abort_handle(where, __func__, bh, handle, err);
+        if (ext4_handle_valid(handle)) {
+                err = jbd2_journal_forget(handle, bh);
+                if (err)
+                        ext4_journal_abort_handle(where, __func__, bh,
+                                                  handle, err);
+        }
        return err;
 }
 int __ext4_journal_revoke(const char *where, handle_t *handle,
                                ext4_fsblk_t blocknr, struct buffer_head *bh)
 {
-        int err = jbd2_journal_revoke(handle, blocknr, bh);
+        int err = 0;
-        if (err)
-                ext4_journal_abort_handle(where, __func__, bh, handle, err);
+        if (ext4_handle_valid(handle)) {
+                err = jbd2_journal_revoke(handle, blocknr, bh);
+                if (err)
+                        ext4_journal_abort_handle(where, __func__, bh,
+                                                  handle, err);
+        }
        return err;
 }
 int __ext4_journal_get_create_access(const char *where,
                                handle_t *handle, struct buffer_head *bh)
 {
-        int err = jbd2_journal_get_create_access(handle, bh);
+        int err = 0;
-        if (err)
-                ext4_journal_abort_handle(where, __func__, bh, handle, err);
+        if (ext4_handle_valid(handle)) {
+                err = jbd2_journal_get_create_access(handle, bh);
+                if (err)
+                        ext4_journal_abort_handle(where, __func__, bh,
+                                                  handle, err);
+        }
        return err;
 }
-int __ext4_journal_dirty_metadata(const char *where,
+int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
-                                handle_t *handle, struct buffer_head *bh)
+                                 struct inode *inode, struct buffer_head *bh)
 {
-        int err = jbd2_journal_dirty_metadata(handle, bh);
+        int err = 0;
-        if (err)
-                ext4_journal_abort_handle(where, __func__, bh, handle, err);
+        if (ext4_handle_valid(handle)) {
+                err = jbd2_journal_dirty_metadata(handle, bh);
+                if (err)
+                        ext4_journal_abort_handle(where, __func__, bh,
+                                                  handle, err);
+        } else {
+                mark_buffer_dirty(bh);
+                if (inode && inode_needs_sync(inode)) {
+                        sync_dirty_buffer(bh);
+                        if (buffer_req(bh) && !buffer_uptodate(bh)) {
+                                ext4_error(inode->i_sb, __func__,
+                                           "IO error syncing inode, "
+                                           "inode=%lu, block=%llu",
+                                           inode->i_ino,
+                                           (unsigned long long) bh->b_blocknr);
+                                err = -EIO;
+                        }
+                }
+        }
        return err;
 }
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index b455c685a98b..be2f426f6805 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -32,8 +32,8 @@
 * 5 levels of tree + root which are stored in the inode. */
 #define EXT4_SINGLEDATA_TRANS_BLOCKS(sb)                                \
-        (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)   \
+        (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)   \
-                || test_opt(sb, EXTENTS) ? 27U : 8U)
+         ? 27U : 8U)
 /* Extended attribute operations touch at most two data buffers,
 * two bitmap buffers, and two group summaries, in addition to the inode
@@ -122,12 +122,6 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode);
 * been done yet.
 */
-static inline void ext4_journal_release_buffer(handle_t *handle,
-                                                struct buffer_head *bh)
-{
-        jbd2_journal_release_buffer(handle, bh);
-}
 void ext4_journal_abort_handle(const char *caller, const char *err_fn,
                struct buffer_head *bh, handle_t *handle, int err);
@@ -146,8 +140,8 @@ int __ext4_journal_revoke(const char *where, handle_t *handle,
 int __ext4_journal_get_create_access(const char *where,
                                handle_t *handle, struct buffer_head *bh);
-int __ext4_journal_dirty_metadata(const char *where,
+int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
-                                handle_t *handle, struct buffer_head *bh);
+                                 struct inode *inode, struct buffer_head *bh);
 #define ext4_journal_get_undo_access(handle, bh) \
        __ext4_journal_get_undo_access(__func__, (handle), (bh))
@@ -157,14 +151,57 @@ int __ext4_journal_dirty_metadata(const char *where,
        __ext4_journal_revoke(__func__, (handle), (blocknr), (bh))
 #define ext4_journal_get_create_access(handle, bh) \
        __ext4_journal_get_create_access(__func__, (handle), (bh))
-#define ext4_journal_dirty_metadata(handle, bh) \
-        __ext4_journal_dirty_metadata(__func__, (handle), (bh))
 #define ext4_journal_forget(handle, bh) \
        __ext4_journal_forget(__func__, (handle), (bh))
+#define ext4_handle_dirty_metadata(handle, inode, bh) \
+        __ext4_handle_dirty_metadata(__func__, (handle), (inode), (bh))
 handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
 int __ext4_journal_stop(const char *where, handle_t *handle);
+#define EXT4_NOJOURNAL_HANDLE   ((handle_t *) 0x1)
+static inline int ext4_handle_valid(handle_t *handle)
+{
+        if (handle == EXT4_NOJOURNAL_HANDLE)
+                return 0;
+        return 1;
+}
+static inline void ext4_handle_sync(handle_t *handle)
+{
+        if (ext4_handle_valid(handle))
+                handle->h_sync = 1;
+}
+static inline void ext4_handle_release_buffer(handle_t *handle,
+                                                struct buffer_head *bh)
+{
+        if (ext4_handle_valid(handle))
+                jbd2_journal_release_buffer(handle, bh);
+}
+static inline int ext4_handle_is_aborted(handle_t *handle)
+{
+        if (ext4_handle_valid(handle))
+                return is_handle_aborted(handle);
+        return 0;
+}
+static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed)
+{
+        if (ext4_handle_valid(handle) && handle->h_buffer_credits < needed)
+                return 0;
+        return 1;
+}
+static inline void ext4_journal_release_buffer(handle_t *handle,
+                                                struct buffer_head *bh)
+{
+        if (ext4_handle_valid(handle))
+                jbd2_journal_release_buffer(handle, bh);
+}
 static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
 {
        return ext4_journal_start_sb(inode->i_sb, nblocks);
@@ -180,27 +217,37 @@ static inline handle_t *ext4_journal_current_handle(void)
 static inline int ext4_journal_extend(handle_t *handle, int nblocks)
 {
-        return jbd2_journal_extend(handle, nblocks);
+        if (ext4_handle_valid(handle))
+                return jbd2_journal_extend(handle, nblocks);
+        return 0;
 }
 static inline int ext4_journal_restart(handle_t *handle, int nblocks)
 {
-        return jbd2_journal_restart(handle, nblocks);
+        if (ext4_handle_valid(handle))
+                return jbd2_journal_restart(handle, nblocks);
+        return 0;
 }
 static inline int ext4_journal_blocks_per_page(struct inode *inode)
 {
-        return jbd2_journal_blocks_per_page(inode);
+        if (EXT4_JOURNAL(inode) != NULL)
+                return jbd2_journal_blocks_per_page(inode);
+        return 0;
 }
 static inline int ext4_journal_force_commit(journal_t *journal)
 {
-        return jbd2_journal_force_commit(journal);
+        if (journal)
+                return jbd2_journal_force_commit(journal);
+        return 0;
 }
 static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
 {
-        return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode);
+        if (ext4_handle_valid(handle))
+                return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode);
+        return 0;
 }
 /* super.c */
@@ -208,6 +255,8 @@ int ext4_force_commit(struct super_block *sb);
 static inline int ext4_should_journal_data(struct inode *inode)
 {
+        if (EXT4_JOURNAL(inode) == NULL)
+                return 0;
        if (!S_ISREG(inode->i_mode))
                return 1;
        if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
@@ -219,6 +268,8 @@ static inline int ext4_should_journal_data(struct inode *inode)
 static inline int ext4_should_order_data(struct inode *inode)
 {
+        if (EXT4_JOURNAL(inode) == NULL)
+                return 0;
        if (!S_ISREG(inode->i_mode))
                return 0;
        if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
@@ -230,6 +281,8 @@ static inline int ext4_should_order_data(struct inode *inode)
 static inline int ext4_should_writeback_data(struct inode *inode)
 {
+        if (EXT4_JOURNAL(inode) == NULL)
+                return 0;
        if (!S_ISREG(inode->i_mode))
                return 0;
        if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 445fde603df8..039b6ea1a042 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -57,6 +57,7 @@ struct ext4_sb_info {
        u32 s_next_generation;
        u32 s_hash_seed[4];
        int s_def_hash_version;
+        int s_hash_unsigned;    /* 3 if hash should be unsigned, 0 if not */
        struct percpu_counter s_freeblocks_counter;
        struct percpu_counter s_freeinodes_counter;
        struct percpu_counter s_dirs_counter;
@@ -73,6 +74,8 @@ struct ext4_sb_info {
        struct journal_s *s_journal;
        struct list_head s_orphan;
        unsigned long s_commit_interval;
+        u32 s_max_batch_time;
+        u32 s_min_batch_time;
        struct block_device *journal_bdev;
 #ifdef CONFIG_JBD2_DEBUG
        struct timer_list turn_ro_timer;        /* For turning read-only (crash simulation) */
@@ -101,7 +104,8 @@ struct ext4_sb_info {
        spinlock_t s_reserve_lock;
        spinlock_t s_md_lock;
        tid_t s_last_transaction;
-        unsigned short *s_mb_offsets, *s_mb_maxs;
+        unsigned short *s_mb_offsets;
+        unsigned int *s_mb_maxs;
        /* tunables */
        unsigned long s_stripe;
@@ -146,4 +150,10 @@ struct ext4_sb_info {
        struct flex_groups *s_flex_groups;
 };
+static inline spinlock_t *
+sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group)
+{
+        return bgl_lock_ptr(&sbi->s_blockgroup_lock, block_group);
+}
 #endif  /* _EXT4_SB */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index ea2ce3c0ae66..e2eab196875f 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -97,6 +97,8 @@ static int ext4_ext_journal_restart(handle_t *handle, int needed)
 {
        int err;
+        if (!ext4_handle_valid(handle))
+                return 0;
        if (handle->h_buffer_credits > needed)
                return 0;
        err = ext4_journal_extend(handle, needed);
@@ -134,7 +136,7 @@ static int ext4_ext_dirty(handle_t *handle, struct inode *inode,
        int err;
        if (path->p_bh) {
                /* path points to block */
-                err = ext4_journal_dirty_metadata(handle, path->p_bh);
+                err = ext4_handle_dirty_metadata(handle, inode, path->p_bh);
        } else {
                /* path points to leaf/index in inode body */
                err = ext4_mark_inode_dirty(handle, inode);
@@ -191,7 +193,7 @@ ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
        ext4_fsblk_t goal, newblock;
        goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
-        newblock = ext4_new_meta_block(handle, inode, goal, err);
+        newblock = ext4_new_meta_blocks(handle, inode, goal, NULL, err);
        return newblock;
 }
@@ -780,7 +782,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
        set_buffer_uptodate(bh);
        unlock_buffer(bh);
-        err = ext4_journal_dirty_metadata(handle, bh);
+        err = ext4_handle_dirty_metadata(handle, inode, bh);
        if (err)
                goto cleanup;
        brelse(bh);
@@ -859,7 +861,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
                set_buffer_uptodate(bh);
                unlock_buffer(bh);
-                err = ext4_journal_dirty_metadata(handle, bh);
+                err = ext4_handle_dirty_metadata(handle, inode, bh);
                if (err)
                        goto cleanup;
                brelse(bh);
@@ -955,7 +957,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
        set_buffer_uptodate(bh);
        unlock_buffer(bh);
-        err = ext4_journal_dirty_metadata(handle, bh);
+        err = ext4_handle_dirty_metadata(handle, inode, bh);
        if (err)
                goto out;
@@ -1160,15 +1162,13 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
        while (--depth >= 0) {
                ix = path[depth].p_idx;
                if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
-                        break;
+                        goto got_index;
        }
-        if (depth < 0) {
+        /* we've gone up to the root and found no index to the right */
-                /* we've gone up to the root and
+        return 0;
-                 * found no index to the right */
-                return 0;
-        }
+got_index:
        /* we've found index to the right, let's
         * follow it and find the closest allocated
         * block to the right */
@@ -1201,7 +1201,6 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
        *phys = ext_pblock(ex);
        put_bh(bh);
        return 0;
 }
 /*
@@ -1622,7 +1621,6 @@ cleanup:
                ext4_ext_drop_refs(npath);
                kfree(npath);
        }
-        ext4_ext_tree_changed(inode);
        ext4_ext_invalidate_cache(inode);
        return err;
 }
@@ -2233,7 +2231,6 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
                }
        }
 out:
-        ext4_ext_tree_changed(inode);
        ext4_ext_drop_refs(path);
        kfree(path);
        ext4_journal_stop(handle);
@@ -2250,7 +2247,7 @@ void ext4_ext_init(struct super_block *sb)
         * possible initialization would be here
         */
-        if (test_opt(sb, EXTENTS)) {
+        if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
                printk(KERN_INFO "EXT4-fs: file extents enabled");
 #ifdef AGGRESSIVE_TEST
                printk(", aggressive tests");
@@ -2275,7 +2272,7 @@ void ext4_ext_init(struct super_block *sb)
 */
 void ext4_ext_release(struct super_block *sb)
 {
-        if (!test_opt(sb, EXTENTS))
+        if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
                return;
 #ifdef EXTENTS_STATS
@@ -2380,7 +2377,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
                                                struct inode *inode,
                                                struct ext4_ext_path *path,
                                                ext4_lblk_t iblock,
-                                                unsigned long max_blocks)
+                                                unsigned int max_blocks)
 {
        struct ext4_extent *ex, newex, orig_ex;
        struct ext4_extent *ex1 = NULL;
@@ -2536,7 +2533,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
                 */
                newdepth = ext_depth(inode);
                /*
-                 * update the extent length after successfull insert of the
+                 * update the extent length after successful insert of the
                 * split extent
                 */
                orig_ex.ee_len = cpu_to_le16(ee_len -
@@ -2678,26 +2675,26 @@ fix_extent_len:
 */
 int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
                        ext4_lblk_t iblock,
-                        unsigned long max_blocks, struct buffer_head *bh_result,
+                        unsigned int max_blocks, struct buffer_head *bh_result,
                        int create, int extend_disksize)
 {
        struct ext4_ext_path *path = NULL;
        struct ext4_extent_header *eh;
        struct ext4_extent newex, *ex;
-        ext4_fsblk_t goal, newblock;
+        ext4_fsblk_t newblock;
-        int err = 0, depth, ret;
+        int err = 0, depth, ret, cache_type;
-        unsigned long allocated = 0;
+        unsigned int allocated = 0;
        struct ext4_allocation_request ar;
        loff_t disksize;
        __clear_bit(BH_New, &bh_result->b_state);
-        ext_debug("blocks %u/%lu requested for inode %u\n",
+        ext_debug("blocks %u/%u requested for inode %u\n",
                        iblock, max_blocks, inode->i_ino);
        /* check in cache */
-        goal = ext4_ext_in_cache(inode, iblock, &newex);
+        cache_type = ext4_ext_in_cache(inode, iblock, &newex);
-        if (goal) {
+        if (cache_type) {
-                if (goal == EXT4_EXT_CACHE_GAP) {
+                if (cache_type == EXT4_EXT_CACHE_GAP) {
                        if (!create) {
                                /*
                                 * block isn't allocated yet and
@@ -2706,7 +2703,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
                                goto out2;
                        }
                        /* we should allocate requested block */
-                } else if (goal == EXT4_EXT_CACHE_EXTENT) {
+                } else if (cache_type == EXT4_EXT_CACHE_EXTENT) {
                        /* block is already allocated */
                        newblock = iblock
                                   - le32_to_cpu(newex.ee_block)
@@ -2854,7 +2851,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
        if (!newblock)
                goto out2;
        ext_debug("allocate new block: goal %llu, found %llu/%lu\n",
-                        goal, newblock, allocated);
+                  ar.goal, newblock, allocated);
        /* try to insert new extent into found leaf and return */
        ext4_ext_store_pblock(&newex, newblock);
@@ -2950,7 +2947,7 @@ void ext4_ext_truncate(struct inode *inode)
         * transaction synchronous.
         */
        if (IS_SYNC(inode))
-                handle->h_sync = 1;
+                ext4_handle_sync(handle);
 out_stop:
        up_write(&EXT4_I(inode)->i_data_sem);
@@ -3004,7 +3001,7 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
        handle_t *handle;
        ext4_lblk_t block;
        loff_t new_size;
-        unsigned long max_blocks;
+        unsigned int max_blocks;
        int ret = 0;
        int ret2 = 0;
        int retries = 0;
@@ -3051,7 +3048,7 @@ retry:
                        WARN_ON(ret <= 0);
                        printk(KERN_ERR "%s: ext4_ext_get_blocks "
                                    "returned error inode#%lu, block=%u, "
-                                    "max_blocks=%lu", __func__,
+                                    "max_blocks=%u", __func__,
                                    inode->i_ino, block, max_blocks);
 #endif
                        ext4_mark_inode_dirty(handle, inode);
@@ -3083,7 +3080,7 @@ retry:
 /*
 * Callback function called for each extent to gather FIEMAP information.
 */
-int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
+static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
                       struct ext4_ext_cache *newex, struct ext4_extent *ex,
                       void *data)
 {
@@ -3152,7 +3149,8 @@ int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
 /* fiemap flags we can handle specified here */
 #define EXT4_FIEMAP_FLAGS       (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
-int ext4_xattr_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo)
+static int ext4_xattr_fiemap(struct inode *inode,
+                                struct fiemap_extent_info *fieinfo)
 {
        __u64 physical = 0;
        __u64 length;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 6bd11fba71f7..f731cb545a03 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -140,9 +140,6 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
        return 0;
 }
-extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
-                __u64 start, __u64 len);
 const struct file_operations ext4_file_operations = {
        .llseek         = generic_file_llseek,
        .read           = do_sync_read,
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index 556ca8eba3db..ac8f168c8ab4 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -35,23 +35,71 @@ static void TEA_transform(__u32 buf[4], __u32 const in[])
 /*
+ * The old legacy hash.
+ *
+ * This variant walks the name through an unsigned char pointer, so the
+ * value fed into the mix for each byte is never negative, whatever the
+ * signedness of plain char on the build architecture.
+ * dx_hack_hash_signed() runs the same loop over signed chars instead.
+ *
+ * Returns the last mixed value shifted left by one bit.
+ */
-static __u32 dx_hack_hash(const char *name, int len)
+static __u32 dx_hack_hash_unsigned(const char *name, int len)
 {
-        __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
+        __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
+        const unsigned char *ucp = (const unsigned char *) name;
+        while (len--) {
+                hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373));
+                if (hash & 0x80000000)
+                        hash -= 0x7fffffff;
+                hash1 = hash0;
+                hash0 = hash;
+        }
+        return hash0 << 1;
+}
+static __u32 dx_hack_hash_signed(const char *name, int len)
+{       /* legacy hash with the name bytes read as signed chars */
+        __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
+        const signed char *scp = (const signed char *) name; /* may be < 0 */
         while (len--) {
-                __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373));
+                hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373));
-                if (hash & 0x80000000) hash -= 0x7fffffff;
+                if (hash & 0x80000000)
+                        hash -= 0x7fffffff;
                 hash1 = hash0;
                 hash0 = hash;
         }
-        return (hash0 << 1);
+        return hash0 << 1;      /* low bit of the result is always 0 */
+}
+static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num)
+{       /* pack up to num*4 bytes of msg, read as signed chars, into buf[] */
+        __u32   pad, val;
+        int     i;
+        const signed char *scp = (const signed char *) msg;
+        pad = (__u32)len | ((__u32)len << 8);   /* uses the unclamped len */
+        pad |= pad << 16;       /* len OR-ed into all four byte positions */
+        val = pad;
+        if (len > num*4)
+                len = num * 4;  /* consume at most num words of input */
+        for (i = 0; i < len; i++) {
+                if ((i % 4) == 0)
+                        val = pad;
+                val = ((int) scp[i]) + (val << 8);
+                if ((i % 4) == 3) {     /* four bytes folded in: emit a word */
+                        *buf++ = val;
+                        val = pad;
+                        num--;
+                }
+        }
+        if (--num >= 0)
+                *buf++ = val;   /* pending partial word, or pad if none */
+        while (--num >= 0)
+                *buf++ = pad;   /* fill the remaining words with pad */
 }
-static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
+static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
 {
        __u32   pad, val;
        int     i;
+        const unsigned char *ucp = (const unsigned char *) msg;
        pad = (__u32)len | ((__u32)len << 8);
        pad |= pad << 16;
@@ -62,7 +110,7 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
        for (i = 0; i < len; i++) {
                if ((i % 4) == 0)
                        val = pad;
-                val = msg[i] + (val << 8);
+                val = ((int) ucp[i]) + (val << 8);
                if ((i % 4) == 3) {
                        *buf++ = val;
                        val = pad;
@@ -95,6 +143,8 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
        const char      *p;
        int             i;
        __u32           in[8], buf[4];
+        void            (*str2hashbuf)(const char *, int, __u32 *, int) =
+                                str2hashbuf_signed;
        /* Initialize the default seed for the hash checksum functions */
        buf[0] = 0x67452301;
@@ -113,13 +163,18 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
        }
        switch (hinfo->hash_version) {
+        case DX_HASH_LEGACY_UNSIGNED:
+                hash = dx_hack_hash_unsigned(name, len);
+                break;
        case DX_HASH_LEGACY:
-                hash = dx_hack_hash(name, len);
+                hash = dx_hack_hash_signed(name, len);
                break;
+        case DX_HASH_HALF_MD4_UNSIGNED:
+                str2hashbuf = str2hashbuf_unsigned;
        case DX_HASH_HALF_MD4:
                p = name;
                while (len > 0) {
-                        str2hashbuf(p, len, in, 8);
+                        (*str2hashbuf)(p, len, in, 8);
                        half_md4_transform(buf, in);
                        len -= 32;
                        p += 32;
@@ -127,10 +182,12 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
                minor_hash = buf[2];
                hash = buf[1];
                break;
+        case DX_HASH_TEA_UNSIGNED:
+                str2hashbuf = str2hashbuf_unsigned;
        case DX_HASH_TEA:
                p = name;
                while (len > 0) {
-                        str2hashbuf(p, len, in, 4);
+                        (*str2hashbuf)(p, len, in, 4);
                        TEA_transform(buf, in);
                        len -= 16;
                        p += 16;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 08cac9fcace2..4fb86a0061d0 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -74,17 +74,17 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
        /* If checksum is bad mark all blocks and inodes use to prevent
         * allocation, essentially implementing a per-group read-only flag. */
        if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
-                ext4_error(sb, __func__, "Checksum bad for group %lu\n",
+                ext4_error(sb, __func__, "Checksum bad for group %u",
                           block_group);
-                gdp->bg_free_blocks_count = 0;
+                ext4_free_blks_set(sb, gdp, 0);
-                gdp->bg_free_inodes_count = 0;
+                ext4_free_inodes_set(sb, gdp, 0);
-                gdp->bg_itable_unused = 0;
+                ext4_itable_unused_set(sb, gdp, 0);
                memset(bh->b_data, 0xff, sb->s_blocksize);
                return 0;
        }
        memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
-        mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb),
+        mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
                        bh->b_data);
        return EXT4_INODES_PER_GROUP(sb);
@@ -111,29 +111,49 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
        if (unlikely(!bh)) {
                ext4_error(sb, __func__,
                            "Cannot read inode bitmap - "
-                            "block_group = %lu, inode_bitmap = %llu",
+                            "block_group = %u, inode_bitmap = %llu",
                            block_group, bitmap_blk);
                return NULL;
        }
-        if (buffer_uptodate(bh) &&
+        if (bitmap_uptodate(bh))
-            !(desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)))
                return bh;
        lock_buffer(bh);
+        if (bitmap_uptodate(bh)) {
+                unlock_buffer(bh);
+                return bh;
+        }
        spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
        if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
                ext4_init_inode_bitmap(sb, bh, block_group, desc);
+                set_bitmap_uptodate(bh);
                set_buffer_uptodate(bh);
-                unlock_buffer(bh);
                spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
+                unlock_buffer(bh);
                return bh;
        }
        spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
+        if (buffer_uptodate(bh)) {
+                /*
+                 * if not uninit if bh is uptodate,
+                 * bitmap is also uptodate
+                 */
+                set_bitmap_uptodate(bh);
+                unlock_buffer(bh);
+                return bh;
+        }
+        /*
+         * submit the buffer_head for read. We can
+         * safely mark the bitmap as uptodate now.
+         * We do it here so the bitmap uptodate bit
+         * get set with buffer lock held.
+         */
+        set_bitmap_uptodate(bh);
        if (bh_submit_read(bh) < 0) {
                put_bh(bh);
                ext4_error(sb, __func__,
                            "Cannot read inode bitmap - "
-                            "block_group = %lu, inode_bitmap = %llu",
+                            "block_group = %u, inode_bitmap = %llu",
                            block_group, bitmap_blk);
                return NULL;
        }
@@ -168,7 +188,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
        struct ext4_group_desc *gdp;
        struct ext4_super_block *es;
        struct ext4_sb_info *sbi;
-        int fatal = 0, err;
+        int fatal = 0, err, count;
        ext4_group_t flex_group;
        if (atomic_read(&inode->i_count) > 1) {
@@ -190,6 +210,11 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
        ino = inode->i_ino;
        ext4_debug("freeing inode %lu\n", ino);
+        trace_mark(ext4_free_inode,
+                   "dev %s ino %lu mode %d uid %lu gid %lu bocks %llu",
+                   sb->s_id, inode->i_ino, inode->i_mode,
+                   (unsigned long) inode->i_uid, (unsigned long) inode->i_gid,
+                   (unsigned long long) inode->i_blocks);
        /*
         * Note: we must free any quota before locking the superblock,
@@ -236,9 +261,12 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
                if (gdp) {
                        spin_lock(sb_bgl_lock(sbi, block_group));
-                        le16_add_cpu(&gdp->bg_free_inodes_count, 1);
+                        count = ext4_free_inodes_count(sb, gdp) + 1;
-                        if (is_directory)
+                        ext4_free_inodes_set(sb, gdp, count);
-                                le16_add_cpu(&gdp->bg_used_dirs_count, -1);
+                        if (is_directory) {
+                                count = ext4_used_dirs_count(sb, gdp) - 1;
+                                ext4_used_dirs_set(sb, gdp, count);
+                        }
                        gdp->bg_checksum = ext4_group_desc_csum(sbi,
                                                        block_group, gdp);
                        spin_unlock(sb_bgl_lock(sbi, block_group));
@@ -253,12 +281,12 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
                                spin_unlock(sb_bgl_lock(sbi, flex_group));
                        }
                }
-                BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
+                BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
-                err = ext4_journal_dirty_metadata(handle, bh2);
+                err = ext4_handle_dirty_metadata(handle, NULL, bh2);
                if (!fatal) fatal = err;
        }
-        BUFFER_TRACE(bitmap_bh, "call ext4_journal_dirty_metadata");
+        BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
-        err = ext4_journal_dirty_metadata(handle, bitmap_bh);
+        err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
        if (!fatal)
                fatal = err;
        sb->s_dirt = 1;
@@ -291,13 +319,13 @@ static int find_group_dir(struct super_block *sb, struct inode *parent,
        for (group = 0; group < ngroups; group++) {
                desc = ext4_get_group_desc(sb, group, NULL);
-                if (!desc || !desc->bg_free_inodes_count)
+                if (!desc || !ext4_free_inodes_count(sb, desc))
                        continue;
-                if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
+                if (ext4_free_inodes_count(sb, desc) < avefreei)
                        continue;
                if (!best_desc ||
-                    (le16_to_cpu(desc->bg_free_blocks_count) >
+                    (ext4_free_blks_count(sb, desc) >
-                     le16_to_cpu(best_desc->bg_free_blocks_count))) {
+                     ext4_free_blks_count(sb, best_desc))) {
                        *best_group = group;
                        best_desc = desc;
                        ret = 0;
@@ -369,7 +397,7 @@ found_flexbg:
        for (i = best_flex * flex_size; i < ngroups &&
                     i < (best_flex + 1) * flex_size; i++) {
                desc = ext4_get_group_desc(sb, i, &bh);
-                if (le16_to_cpu(desc->bg_free_inodes_count)) {
+                if (ext4_free_inodes_count(sb, desc)) {
                        *best_group = i;
                        goto out;
                }
@@ -443,17 +471,17 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
                for (i = 0; i < ngroups; i++) {
                        grp = (parent_group + i) % ngroups;
                        desc = ext4_get_group_desc(sb, grp, NULL);
-                        if (!desc || !desc->bg_free_inodes_count)
+                        if (!desc || !ext4_free_inodes_count(sb, desc))
                                continue;
-                        if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir)
+                        if (ext4_used_dirs_count(sb, desc) >= best_ndir)
                                continue;
-                        if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
+                        if (ext4_free_inodes_count(sb, desc) < avefreei)
                                continue;
-                        if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb)
+                        if (ext4_free_blks_count(sb, desc) < avefreeb)
                                continue;
                        *group = grp;
                        ret = 0;
-                        best_ndir = le16_to_cpu(desc->bg_used_dirs_count);
+                        best_ndir = ext4_used_dirs_count(sb, desc);
                }
                if (ret == 0)
                        return ret;
@@ -479,13 +507,13 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
        for (i = 0; i < ngroups; i++) {
                *group = (parent_group + i) % ngroups;
                desc = ext4_get_group_desc(sb, *group, NULL);
-                if (!desc || !desc->bg_free_inodes_count)
+                if (!desc || !ext4_free_inodes_count(sb, desc))
                        continue;
-                if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs)
+                if (ext4_used_dirs_count(sb, desc) >= max_dirs)
                        continue;
-                if (le16_to_cpu(desc->bg_free_inodes_count) < min_inodes)
+                if (ext4_free_inodes_count(sb, desc) < min_inodes)
                        continue;
-                if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks)
+                if (ext4_free_blks_count(sb, desc) < min_blocks)
                        continue;
                return 0;
        }
@@ -494,8 +522,8 @@ fallback:
        for (i = 0; i < ngroups; i++) {
                *group = (parent_group + i) % ngroups;
                desc = ext4_get_group_desc(sb, *group, NULL);
-                if (desc && desc->bg_free_inodes_count &&
+                if (desc && ext4_free_inodes_count(sb, desc) &&
-                        le16_to_cpu(desc->bg_free_inodes_count) >= avefreei)
+                        ext4_free_inodes_count(sb, desc) >= avefreei)
                        return 0;
        }
@@ -524,8 +552,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
         */
        *group = parent_group;
        desc = ext4_get_group_desc(sb, *group, NULL);
-        if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
+        if (desc && ext4_free_inodes_count(sb, desc) &&
-                        le16_to_cpu(desc->bg_free_blocks_count))
+                        ext4_free_blks_count(sb, desc))
                return 0;
        /*
@@ -548,8 +576,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
                if (*group >= ngroups)
                        *group -= ngroups;
                desc = ext4_get_group_desc(sb, *group, NULL);
-                if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
+                if (desc && ext4_free_inodes_count(sb, desc) &&
-                                le16_to_cpu(desc->bg_free_blocks_count))
+                                ext4_free_blks_count(sb, desc))
                        return 0;
        }
@@ -562,7 +590,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
                if (++*group >= ngroups)
                        *group = 0;
                desc = ext4_get_group_desc(sb, *group, NULL);
-                if (desc && le16_to_cpu(desc->bg_free_inodes_count))
+                if (desc && ext4_free_inodes_count(sb, desc))
                        return 0;
        }
@@ -570,6 +598,79 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
 }
 /*
+ * Claim the inode at bit position @ino from the group's inode bitmap. If
+ * the group is uninit we need to take the group's sb_bgl_lock
+ * and clear the uninit flag. The inode bitmap update
+ * and group desc uninit flag clear should be done
+ * after holding sb_bgl_lock so that ext4_read_inode_bitmap
+ * doesn't race with the ext4_claim_inode
+ *
+ * @ino is the 0-based bit index on entry; once the bit has been set it is
+ * incremented to the 1-based inode number within the group.
+ *
+ * Returns 0 when the bit was claimed and the group descriptor counters
+ * and checksum were updated. Returns 1 when ext4_set_bit() reports the
+ * bit as not free, or when the resulting inode number is reserved or
+ * beyond the group (reported through ext4_error() after dropping the
+ * lock; the bit set above is not cleared on that path).
+ *
+ * NOTE(review): the descriptor from ext4_get_group_desc() is used without
+ * a NULL check here -- presumably the caller has already looked it up
+ * successfully; confirm.
+ */
+static int ext4_claim_inode(struct super_block *sb,
+                        struct buffer_head *inode_bitmap_bh,
+                        unsigned long ino, ext4_group_t group, int mode)
+{
+        int free = 0, retval = 0, count;
+        struct ext4_sb_info *sbi = EXT4_SB(sb);
+        struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
+        spin_lock(sb_bgl_lock(sbi, group));
+        if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
+                /* not a free inode */
+                retval = 1;
+                goto err_ret;
+        }
+        ino++;  /* bit 0 corresponds to inode number 1 */
+        if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
+                        ino > EXT4_INODES_PER_GROUP(sb)) {
+                spin_unlock(sb_bgl_lock(sbi, group));
+                ext4_error(sb, __func__,
+                           "reserved inode or inode > inodes count - "
+                           "block_group = %u, inode=%lu", group,
+                           ino + group * EXT4_INODES_PER_GROUP(sb));
+                return 1;
+        }
+        /* If we didn't allocate from within the initialized part of the inode
+         * table then we need to initialize up to this inode. */
+        if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
+                if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
+                        gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
+                        /* When marking the block group with
+                         * ~EXT4_BG_INODE_UNINIT we don't want to depend
+                         * on the value of bg_itable_unused even though
+                         * mke2fs could have initialized the same for us.
+                         * Instead we calculated the value below
+                         */
+                        free = 0;
+                } else {
+                        free = EXT4_INODES_PER_GROUP(sb) -
+                                ext4_itable_unused_count(sb, gdp);
+                }
+                /*
+                 * Check the relative inode number against the last used
+                 * relative inode number in this group. if it is greater
+                 * we need to update the bg_itable_unused count
+                 *
+                 */
+                if (ino > free)
+                        ext4_itable_unused_set(sb, gdp,
+                                        (EXT4_INODES_PER_GROUP(sb) - ino));
+        }
+        count = ext4_free_inodes_count(sb, gdp) - 1;
+        ext4_free_inodes_set(sb, gdp, count);
+        if (S_ISDIR(mode)) {
+                count = ext4_used_dirs_count(sb, gdp) + 1;
+                ext4_used_dirs_set(sb, gdp, count);
+        }
+        gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
+err_ret:
+        spin_unlock(sb_bgl_lock(sbi, group));
+        return retval;
+}
+/*
 * There are two policies for allocating an inode.  If the new inode is
 * a directory, then a forward search is made for a block group with both
 * free space and a low directory-to-inode ratio; if that fails, then of
@@ -582,8 +683,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
 struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
 {
        struct super_block *sb;
-        struct buffer_head *bitmap_bh = NULL;
+        struct buffer_head *inode_bitmap_bh = NULL;
-        struct buffer_head *bh2;
+        struct buffer_head *group_desc_bh;
        ext4_group_t group = 0;
        unsigned long ino = 0;
        struct inode *inode;
@@ -602,6 +703,8 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
                return ERR_PTR(-EPERM);
        sb = dir->i_sb;
+        trace_mark(ext4_request_inode, "dev %s dir %lu mode %d", sb->s_id,
+                   dir->i_ino, mode);
        inode = new_inode(sb);
        if (!inode)
                return ERR_PTR(-ENOMEM);
@@ -631,40 +734,52 @@ got_group:
        for (i = 0; i < sbi->s_groups_count; i++) {
                err = -EIO;
-                gdp = ext4_get_group_desc(sb, group, &bh2);
+                gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
                if (!gdp)
                        goto fail;
-                brelse(bitmap_bh);
+                brelse(inode_bitmap_bh);
-                bitmap_bh = ext4_read_inode_bitmap(sb, group);
+                inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
-                if (!bitmap_bh)
+                if (!inode_bitmap_bh)
                        goto fail;
                ino = 0;
 repeat_in_this_group:
                ino = ext4_find_next_zero_bit((unsigned long *)
-                                bitmap_bh->b_data, EXT4_INODES_PER_GROUP(sb), ino);
+                                              inode_bitmap_bh->b_data,
+                                              EXT4_INODES_PER_GROUP(sb), ino);
                if (ino < EXT4_INODES_PER_GROUP(sb)) {
-                        BUFFER_TRACE(bitmap_bh, "get_write_access");
+                        BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
-                        err = ext4_journal_get_write_access(handle, bitmap_bh);
+                        err = ext4_journal_get_write_access(handle,
+                                                            inode_bitmap_bh);
                        if (err)
                                goto fail;
-                        if (!ext4_set_bit_atomic(sb_bgl_lock(sbi, group),
+                        BUFFER_TRACE(group_desc_bh, "get_write_access");
-                                                ino, bitmap_bh->b_data)) {
+                        err = ext4_journal_get_write_access(handle,
+                                                                group_desc_bh);
+                        if (err)
+                                goto fail;
+                        if (!ext4_claim_inode(sb, inode_bitmap_bh,
+                                                ino, group, mode)) {
                                /* we won it */
-                                BUFFER_TRACE(bitmap_bh,
+                                BUFFER_TRACE(inode_bitmap_bh,
-                                        "call ext4_journal_dirty_metadata");
+                                        "call ext4_handle_dirty_metadata");
-                                err = ext4_journal_dirty_metadata(handle,
+                                err = ext4_handle_dirty_metadata(handle,
-                                                                bitmap_bh);
+                                                                 inode,
+                                                        inode_bitmap_bh);
                                if (err)
                                        goto fail;
+                                /* zero bit is inode number 1*/
+                                ino++;
                                goto got;
                        }
                        /* we lost it */
-                        jbd2_journal_release_buffer(handle, bitmap_bh);
+                        ext4_handle_release_buffer(handle, inode_bitmap_bh);
+                        ext4_handle_release_buffer(handle, group_desc_bh);
                        if (++ino < EXT4_INODES_PER_GROUP(sb))
                                goto repeat_in_this_group;
@@ -684,30 +799,16 @@ repeat_in_this_group:
        goto out;
 got:
-        ino++;
-        if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
-            ino > EXT4_INODES_PER_GROUP(sb)) {
-                ext4_error(sb, __func__,
-                           "reserved inode or inode > inodes count - "
-                           "block_group = %lu, inode=%lu", group,
-                           ino + group * EXT4_INODES_PER_GROUP(sb));
-                err = -EIO;
-                goto fail;
-        }
-        BUFFER_TRACE(bh2, "get_write_access");
-        err = ext4_journal_get_write_access(handle, bh2);
-        if (err) goto fail;
        /* We may have to initialize the block bitmap if it isn't already */
        if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) &&
            gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
-                struct buffer_head *block_bh = ext4_read_block_bitmap(sb, group);
+                struct buffer_head *block_bitmap_bh;
-                BUFFER_TRACE(block_bh, "get block bitmap access");
+                block_bitmap_bh = ext4_read_block_bitmap(sb, group);
-                err = ext4_journal_get_write_access(handle, block_bh);
+                BUFFER_TRACE(block_bitmap_bh, "get block bitmap access");
+                err = ext4_journal_get_write_access(handle, block_bitmap_bh);
                if (err) {
-                        brelse(block_bh);
+                        brelse(block_bitmap_bh);
                        goto fail;
                }
@@ -715,9 +816,9 @@ got:
                spin_lock(sb_bgl_lock(sbi, group));
                /* recheck and clear flag under lock if we still need to */
                if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
-                        gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
                        free = ext4_free_blocks_after_init(sb, group, gdp);
-                        gdp->bg_free_blocks_count = cpu_to_le16(free);
+                        gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
+                        ext4_free_blks_set(sb, gdp, free);
                        gdp->bg_checksum = ext4_group_desc_csum(sbi, group,
                                                                gdp);
                }
@@ -725,55 +826,19 @@ got:
                /* Don't need to dirty bitmap block if we didn't change it */
                if (free) {
-                        BUFFER_TRACE(block_bh, "dirty block bitmap");
+                        BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
-                        err = ext4_journal_dirty_metadata(handle, block_bh);
+                        err = ext4_handle_dirty_metadata(handle,
+                                                        NULL, block_bitmap_bh);
                }
-                brelse(block_bh);
+                brelse(block_bitmap_bh);
                if (err)
                        goto fail;
        }
+        BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata");
-        spin_lock(sb_bgl_lock(sbi, group));
+        err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh);
-        /* If we didn't allocate from within the initialized part of the inode
+        if (err)
-         * table then we need to initialize up to this inode. */
+                goto fail;
-        if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
-                if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
-                        gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
-                        /* When marking the block group with
-                         * ~EXT4_BG_INODE_UNINIT we don't want to depend
-                         * on the value of bg_itable_unused even though
-                         * mke2fs could have initialized the same for us.
-                         * Instead we calculated the value below
-                         */
-                        free = 0;
-                } else {
-                        free = EXT4_INODES_PER_GROUP(sb) -
-                                le16_to_cpu(gdp->bg_itable_unused);
-                }
-                /*
-                 * Check the relative inode number against the last used
-                 * relative inode number in this group. if it is greater
-                 * we need to  update the bg_itable_unused count
-                 *
-                 */
-                if (ino > free)
-                        gdp->bg_itable_unused =
-                                cpu_to_le16(EXT4_INODES_PER_GROUP(sb) - ino);
-        }
-        le16_add_cpu(&gdp->bg_free_inodes_count, -1);
-        if (S_ISDIR(mode)) {
-                le16_add_cpu(&gdp->bg_used_dirs_count, 1);
-        }
-        gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
-        spin_unlock(sb_bgl_lock(sbi, group));
-        BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
-        err = ext4_journal_dirty_metadata(handle, bh2);
-        if (err) goto fail;
        percpu_counter_dec(&sbi->s_freeinodes_counter);
        if (S_ISDIR(mode))
@@ -825,8 +890,11 @@ got:
        ext4_set_inode_flags(inode);
        if (IS_DIRSYNC(inode))
-                handle->h_sync = 1;
+                ext4_handle_sync(handle);
-        insert_inode_hash(inode);
+        if (insert_inode_locked(inode) < 0) {
+                err = -EINVAL;
+                goto fail_drop;
+        }
        spin_lock(&sbi->s_next_gen_lock);
        inode->i_generation = sbi->s_next_generation++;
        spin_unlock(&sbi->s_next_gen_lock);
@@ -849,7 +917,7 @@ got:
        if (err)
                goto fail_free_drop;
-        if (test_opt(sb, EXTENTS)) {
+        if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
                /* set extent flag only for directory, file and normal symlink*/
                if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
                        EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
@@ -864,6 +932,8 @@ got:
        }
        ext4_debug("allocating inode %lu\n", inode->i_ino);
+        trace_mark(ext4_allocate_inode, "dev %s ino %lu dir %lu mode %d",
+                   sb->s_id, inode->i_ino, dir->i_ino, mode);
        goto really_out;
 fail:
        ext4_std_error(sb, err);
@@ -871,7 +941,7 @@ out:
        iput(inode);
        ret = ERR_PTR(err);
 really_out:
-        brelse(bitmap_bh);
+        brelse(inode_bitmap_bh);
        return ret;
 fail_free_drop:
@@ -881,8 +951,9 @@ fail_drop:
        DQUOT_DROP(inode);
        inode->i_flags |= S_NOQUOTA;
        inode->i_nlink = 0;
+        unlock_new_inode(inode);
        iput(inode);
-        brelse(bitmap_bh);
+        brelse(inode_bitmap_bh);
        return ERR_PTR(err);
 }
@@ -981,7 +1052,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
                gdp = ext4_get_group_desc(sb, i, NULL);
                if (!gdp)
                        continue;
-                desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
+                desc_count += ext4_free_inodes_count(sb, gdp);
                brelse(bitmap_bh);
                bitmap_bh = ext4_read_inode_bitmap(sb, i);
                if (!bitmap_bh)
@@ -989,7 +1060,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
                x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8);
                printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
-                        i, le16_to_cpu(gdp->bg_free_inodes_count), x);
+                        i, ext4_free_inodes_count(sb, gdp), x);
                bitmap_count += x;
        }
        brelse(bitmap_bh);
@@ -1003,7 +1074,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
                gdp = ext4_get_group_desc(sb, i, NULL);
                if (!gdp)
                        continue;
-                desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
+                desc_count += ext4_free_inodes_count(sb, gdp);
                cond_resched();
        }
        return desc_count;
@@ -1020,8 +1091,7 @@ unsigned long ext4_count_dirs(struct super_block * sb)
                struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
                if (!gdp)
                        continue;
-                count += le16_to_cpu(gdp->bg_used_dirs_count);
+                count += ext4_used_dirs_count(sb, gdp);
        }
        return count;
 }
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index be21a5ae33cb..03ba20be1329 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -34,6 +34,7 @@
 #include <linux/writeback.h>
 #include <linux/pagevec.h>
 #include <linux/mpage.h>
+#include <linux/namei.h>
 #include <linux/uio.h>
 #include <linux/bio.h>
 #include "ext4_jbd2.h"
@@ -71,12 +72,17 @@ static int ext4_inode_is_fast_symlink(struct inode *inode)
 * "bh" may be NULL: a metadata block may have been freed from memory
 * but there may still be a record of it in the journal, and that record
 * still needs to be revoked.
+ *
+ * If the handle isn't valid we're not journaling so there's nothing to do.
 */
 int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
                        struct buffer_head *bh, ext4_fsblk_t blocknr)
 {
        int err;
+        if (!ext4_handle_valid(handle))
+                return 0;
        might_sleep();
        BUFFER_TRACE(bh, "enter");
@@ -169,7 +175,9 @@ static handle_t *start_transaction(struct inode *inode)
 */
 static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
 {
-        if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS)
+        if (!ext4_handle_valid(handle))
+                return 0;
+        if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
                return 0;
        if (!ext4_journal_extend(handle, blocks_for_truncate(inode)))
                return 0;
@@ -183,6 +191,7 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
 */
 static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
 {
+        BUG_ON(EXT4_JOURNAL(inode) == NULL);    /* trips if EXT4_JOURNAL(inode) is NULL; presumably a handle restart is meaningless without a journal -- confirm */
         jbd_debug(2, "restarting handle %p\n", handle);
         return ext4_journal_restart(handle, blocks_for_truncate(inode));
 }
@@ -215,7 +224,7 @@ void ext4_delete_inode(struct inode *inode)
        }
        if (IS_SYNC(inode))
-                handle->h_sync = 1;
+                ext4_handle_sync(handle);
        inode->i_size = 0;
        err = ext4_mark_inode_dirty(handle, inode);
        if (err) {
@@ -232,7 +241,7 @@ void ext4_delete_inode(struct inode *inode)
         * enough credits left in the handle to remove the inode from
         * the orphan list and set the dtime field.
         */
-        if (handle->h_buffer_credits < 3) {
+        if (!ext4_handle_has_enough_credits(handle, 3)) {
                err = ext4_journal_extend(handle, 3);
                if (err > 0)
                        err = ext4_journal_restart(handle, 3);
@@ -351,9 +360,9 @@ static int ext4_block_to_path(struct inode *inode,
                final = ptrs;
        } else {
                ext4_warning(inode->i_sb, "ext4_block_to_path",
-                                "block %lu > max",
+                                "block %lu > max in inode %lu",
                                i_block + direct_blocks +
-                                indirect_blocks + double_blocks);
+                                indirect_blocks + double_blocks, inode->i_ino);
        }
        if (boundary)
                *boundary = final - 1 - (i_block & (ptrs - 1));
@@ -505,10 +514,10 @@ static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
 *      return the total number of blocks to be allocate, including the
 *      direct and indirect blocks.
 */
-static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
+static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
                int blocks_to_boundary)
 {
-        unsigned long count = 0;
+        unsigned int count = 0;
        /*
         * Simple case, [t,d]Indirect block(s) has not allocated yet
@@ -546,6 +555,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
                                int indirect_blks, int blks,
                                ext4_fsblk_t new_blocks[4], int *err)
 {
+        struct ext4_allocation_request ar;
        int target, i;
        unsigned long count = 0, blk_allocated = 0;
        int index = 0;
@@ -594,10 +604,17 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
        if (!target)
                goto allocated;
        /* Now allocate data blocks */
-        count = target;
+        memset(&ar, 0, sizeof(ar));
-        /* allocating blocks for data blocks */
+        ar.inode = inode;
-        current_block = ext4_new_blocks(handle, inode, iblock,
+        ar.goal = goal;
-                                                goal, &count, err);
+        ar.len = target;
+        ar.logical = iblock;
+        if (S_ISREG(inode->i_mode))
+                /* enable in-core preallocation only for regular files */
+                ar.flags = EXT4_MB_HINT_DATA;
+        current_block = ext4_mb_new_blocks(handle, &ar, err);
        if (*err && (target == blks)) {
                /*
                 * if the allocation failed and we didn't allocate
@@ -613,7 +630,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
                 */
                        new_blocks[index] = current_block;
                }
-                blk_allocated += count;
+                blk_allocated += ar.len;
        }
 allocated:
        /* total number of blocks allocated for direct blocks */
@@ -708,8 +725,8 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
                set_buffer_uptodate(bh);
                unlock_buffer(bh);
-                BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
+                BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
-                err = ext4_journal_dirty_metadata(handle, bh);
+                err = ext4_handle_dirty_metadata(handle, inode, bh);
                if (err)
                        goto failed;
        }
@@ -791,8 +808,8 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
                 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
                 */
                jbd_debug(5, "splicing indirect only\n");
-                BUFFER_TRACE(where->bh, "call ext4_journal_dirty_metadata");
+                BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata");
-                err = ext4_journal_dirty_metadata(handle, where->bh);
+                err = ext4_handle_dirty_metadata(handle, inode, where->bh);
                if (err)
                        goto err_out;
        } else {
@@ -839,10 +856,10 @@ err_out:
 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
 * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
 */
-int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
+static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
-                ext4_lblk_t iblock, unsigned long maxblocks,
+                                  ext4_lblk_t iblock, unsigned int maxblocks,
-                struct buffer_head *bh_result,
+                                  struct buffer_head *bh_result,
-                int create, int extend_disksize)
+                                  int create, int extend_disksize)
 {
        int err = -EIO;
        ext4_lblk_t offsets[4];
@@ -1044,7 +1061,7 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
 * It returns the error in case of allocation failure.
 */
 int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
-                        unsigned long max_blocks, struct buffer_head *bh,
+                        unsigned int max_blocks, struct buffer_head *bh,
                        int create, int extend_disksize, int flag)
 {
        int retval;
@@ -1220,8 +1237,8 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
                                set_buffer_uptodate(bh);
                        }
                        unlock_buffer(bh);
-                        BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
+                        BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
-                        err = ext4_journal_dirty_metadata(handle, bh);
+                        err = ext4_handle_dirty_metadata(handle, inode, bh);
                        if (!fatal)
                                fatal = err;
                } else {
@@ -1334,6 +1351,10 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
        pgoff_t index;
        unsigned from, to;
+        trace_mark(ext4_write_begin,
+                   "dev %s ino %lu pos %llu len %u flags %u",
+                   inode->i_sb->s_id, inode->i_ino,
+                   (unsigned long long) pos, len, flags);
        index = pos >> PAGE_CACHE_SHIFT;
        from = pos & (PAGE_CACHE_SIZE - 1);
        to = from + len;
@@ -1345,7 +1366,7 @@ retry:
                goto out;
        }
-        page = __grab_cache_page(mapping, index);
+        page = grab_cache_page_write_begin(mapping, index, flags);
        if (!page) {
                ext4_journal_stop(handle);
                ret = -ENOMEM;
@@ -1386,7 +1407,7 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
        if (!buffer_mapped(bh) || buffer_freed(bh))
                return 0;
        set_buffer_uptodate(bh);
-        return ext4_journal_dirty_metadata(handle, bh);
+        return ext4_handle_dirty_metadata(handle, NULL, bh);
 }
 /*
@@ -1405,6 +1426,10 @@ static int ext4_ordered_write_end(struct file *file,
        struct inode *inode = mapping->host;
        int ret = 0, ret2;
+        trace_mark(ext4_ordered_write_end,
+                   "dev %s ino %lu pos %llu len %u copied %u",
+                   inode->i_sb->s_id, inode->i_ino,
+                   (unsigned long long) pos, len, copied);
        ret = ext4_jbd2_file_inode(handle, inode);
        if (ret == 0) {
@@ -1443,6 +1468,10 @@ static int ext4_writeback_write_end(struct file *file,
        int ret = 0, ret2;
        loff_t new_i_size;
+        trace_mark(ext4_writeback_write_end,
+                   "dev %s ino %lu pos %llu len %u copied %u",
+                   inode->i_sb->s_id, inode->i_ino,
+                   (unsigned long long) pos, len, copied);
        new_i_size = pos + copied;
        if (new_i_size > EXT4_I(inode)->i_disksize) {
                ext4_update_i_disksize(inode, new_i_size);
@@ -1478,6 +1507,10 @@ static int ext4_journalled_write_end(struct file *file,
        unsigned from, to;
        loff_t new_i_size;
+        trace_mark(ext4_journalled_write_end,
+                   "dev %s ino %lu pos %llu len %u copied %u",
+                   inode->i_sb->s_id, inode->i_ino,
+                   (unsigned long long) pos, len, copied);
        from = pos & (PAGE_CACHE_SIZE - 1);
        to = from + len;
@@ -1624,7 +1657,7 @@ struct mpage_da_data {
        get_block_t *get_block;
        struct writeback_control *wbc;
        int io_done;
-        long pages_written;
+        int pages_written;
        int retval;
 };
@@ -1644,35 +1677,39 @@ struct mpage_da_data {
 */
 static int mpage_da_submit_io(struct mpage_da_data *mpd)
 {
-        struct address_space *mapping = mpd->inode->i_mapping;
-        int ret = 0, err, nr_pages, i;
-        unsigned long index, end;
-        struct pagevec pvec;
        long pages_skipped;
+        struct pagevec pvec;
+        unsigned long index, end;
+        int ret = 0, err, nr_pages, i;
+        struct inode *inode = mpd->inode;
+        struct address_space *mapping = inode->i_mapping;
        BUG_ON(mpd->next_page <= mpd->first_page);
-        pagevec_init(&pvec, 0);
+        /*
+         * We need to start from the first_page to the next_page - 1
+         * to make sure we also write the mapped dirty buffer_heads.
+         * If we look at mpd->lbh.b_blocknr we would only be looking
+         * at the currently mapped buffer_heads.
+         */
        index = mpd->first_page;
        end = mpd->next_page - 1;
+        pagevec_init(&pvec, 0);
        while (index <= end) {
-                /*
+                nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
-                 * We can use PAGECACHE_TAG_DIRTY lookup here because
-                 * even though we have cleared the dirty flag on the page
-                 * We still keep the page in the radix tree with tag
-                 * PAGECACHE_TAG_DIRTY. See clear_page_dirty_for_io.
-                 * The PAGECACHE_TAG_DIRTY is cleared in set_page_writeback
-                 * which is called via the below writepage callback.
-                 */
-                nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
-                                        PAGECACHE_TAG_DIRTY,
-                                        min(end - index,
-                                        (pgoff_t)PAGEVEC_SIZE-1) + 1);
                if (nr_pages == 0)
                        break;
                for (i = 0; i < nr_pages; i++) {
                        struct page *page = pvec.pages[i];
+                        index = page->index;
+                        if (index > end)
+                                break;
+                        index++;
+                        BUG_ON(!PageLocked(page));
+                        BUG_ON(PageWriteback(page));
                        pages_skipped = mpd->wbc->pages_skipped;
                        err = mapping->a_ops->writepage(page, mpd->wbc);
                        if (!err && (pages_skipped == mpd->wbc->pages_skipped))
@@ -1830,13 +1867,13 @@ static void ext4_print_free_blocks(struct inode *inode)
                        ext4_count_free_blocks(inode->i_sb));
        printk(KERN_EMERG "Free/Dirty block details\n");
        printk(KERN_EMERG "free_blocks=%lld\n",
-                        percpu_counter_sum(&sbi->s_freeblocks_counter));
+                        (long long)percpu_counter_sum(&sbi->s_freeblocks_counter));
        printk(KERN_EMERG "dirty_blocks=%lld\n",
-                        percpu_counter_sum(&sbi->s_dirtyblocks_counter));
+                        (long long)percpu_counter_sum(&sbi->s_dirtyblocks_counter));
        printk(KERN_EMERG "Block reservation details\n");
-        printk(KERN_EMERG "i_reserved_data_blocks=%lu\n",
+        printk(KERN_EMERG "i_reserved_data_blocks=%u\n",
                        EXT4_I(inode)->i_reserved_data_blocks);
-        printk(KERN_EMERG "i_reserved_meta_blocks=%lu\n",
+        printk(KERN_EMERG "i_reserved_meta_blocks=%u\n",
                        EXT4_I(inode)->i_reserved_meta_blocks);
        return;
 }
@@ -2086,11 +2123,29 @@ static int __mpage_da_writepage(struct page *page,
                bh = head;
                do {
                        BUG_ON(buffer_locked(bh));
+                        /*
+                         * We need to try to allocate
+                         * unmapped blocks in the same page.
+                         * Otherwise we won't make progress
+                         * with the page in ext4_da_writepage
+                         */
                        if (buffer_dirty(bh) &&
                                (!buffer_mapped(bh) || buffer_delay(bh))) {
                                mpage_add_bh_to_extent(mpd, logical, bh);
                                if (mpd->io_done)
                                        return MPAGE_DA_EXTENT_TAIL;
+                        } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
+                                /*
+                                 * mapped dirty buffer. We need to update
+                                 * the b_state because we look at
+                                 * b_state in mpage_da_map_blocks. We don't
+                                 * update b_size because if we find an
+                                 * unmapped buffer_head later we need to
+                                 * use the b_state flag of that buffer_head.
+                                 */
+                                if (mpd->lbh.b_size == 0)
+                                        mpd->lbh.b_state =
+                                                bh->b_state & BH_FLAGS;
                        }
                        logical++;
                } while ((bh = bh->b_this_page) != head);
@@ -2268,10 +2323,13 @@ static int ext4_da_writepage(struct page *page,
 {
        int ret = 0;
        loff_t size;
-        unsigned long len;
+        unsigned int len;
        struct buffer_head *page_bufs;
        struct inode *inode = page->mapping->host;
+        trace_mark(ext4_da_writepage,
+                   "dev %s ino %lu page_index %lu",
+                   inode->i_sb->s_id, inode->i_ino, page->index);
        size = i_size_read(inode);
        if (page->index == size >> PAGE_CACHE_SHIFT)
                len = size & ~PAGE_CACHE_MASK;
@@ -2377,10 +2435,25 @@ static int ext4_da_writepages(struct address_space *mapping,
        struct mpage_da_data mpd;
        struct inode *inode = mapping->host;
        int no_nrwrite_index_update;
-        long pages_written = 0, pages_skipped;
+        int pages_written = 0;
+        long pages_skipped;
        int needed_blocks, ret = 0, nr_to_writebump = 0;
        struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
+        trace_mark(ext4_da_writepages,
+                   "dev %s ino %lu nr_t_write %ld "
+                   "pages_skipped %ld range_start %llu "
+                   "range_end %llu nonblocking %d "
+                   "for_kupdate %d for_reclaim %d "
+                   "for_writepages %d range_cyclic %d",
+                   inode->i_sb->s_id, inode->i_ino,
+                   wbc->nr_to_write, wbc->pages_skipped,
+                   (unsigned long long) wbc->range_start,
+                   (unsigned long long) wbc->range_end,
+                   wbc->nonblocking, wbc->for_kupdate,
+                   wbc->for_reclaim, wbc->for_writepages,
+                   wbc->range_cyclic);
        /*
         * No pages to write? This is mainly a kludge to avoid starting
         * a transaction for special inodes like journal inode on last iput()
@@ -2388,6 +2461,20 @@ static int ext4_da_writepages(struct address_space *mapping,
         */
        if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
                return 0;
+        /*
+         * If the filesystem has aborted, it is read-only, so return
+         * right away instead of dumping stack traces later on that
+         * will obscure the real source of the problem.  We test
+         * EXT4_MOUNT_ABORT instead of sb->s_flag's MS_RDONLY because
+         * the latter could be true if the filesystem is mounted
+         * read-only, and in that case, ext4_da_writepages should
+         * *never* be called, so if that ever happens, we would want
+         * the stack trace.
+         */
+        if (unlikely(sbi->s_mount_opt & EXT4_MOUNT_ABORT))
+                return -EROFS;
        /*
         * Make sure nr_to_write is >= sbi->s_mb_stream_request
         * This make sure small files blocks are allocated in
@@ -2432,7 +2519,7 @@ static int ext4_da_writepages(struct address_space *mapping,
                handle = ext4_journal_start(inode, needed_blocks);
                if (IS_ERR(handle)) {
                        ret = PTR_ERR(handle);
-                        printk(KERN_EMERG "%s: jbd2_start: "
+                        printk(KERN_CRIT "%s: jbd2_start: "
                               "%ld pages, ino %lu; err %d\n", __func__,
                                wbc->nr_to_write, inode->i_ino, ret);
                        dump_stack();
@@ -2485,6 +2572,14 @@ out_writepages:
        if (!no_nrwrite_index_update)
                wbc->no_nrwrite_index_update = 0;
        wbc->nr_to_write -= nr_to_writebump;
+        trace_mark(ext4_da_writepage_result,
+                   "dev %s ino %lu ret %d pages_written %d "
+                   "pages_skipped %ld congestion %d "
+                   "more_io %d no_nrwrite_index_update %d",
+                   inode->i_sb->s_id, inode->i_ino, ret,
+                   pages_written, wbc->pages_skipped,
+                   wbc->encountered_congestion, wbc->more_io,
+                   wbc->no_nrwrite_index_update);
        return ret;
 }
@@ -2497,7 +2592,7 @@ static int ext4_nonda_switch(struct super_block *sb)
        /*
         * switch to non delalloc mode if we are running low
         * on free block. The free block accounting via percpu
-         * counters can get slightly wrong with FBC_BATCH getting
+         * counters can get slightly wrong with percpu_counter_batch getting
         * accumulated on each CPU without updating global counters
         * Delalloc need an accurate free block accounting. So switch
         * to non delalloc when we are near to error range.
@@ -2536,6 +2631,11 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
                                        len, flags, pagep, fsdata);
        }
        *fsdata = (void *)0;
+        trace_mark(ext4_da_write_begin,
+                   "dev %s ino %lu pos %llu len %u flags %u",
+                   inode->i_sb->s_id, inode->i_ino,
+                   (unsigned long long) pos, len, flags);
 retry:
        /*
         * With delayed allocation, we don't log the i_disksize update
@@ -2549,7 +2649,7 @@ retry:
                goto out;
        }
-        page = __grab_cache_page(mapping, index);
+        page = grab_cache_page_write_begin(mapping, index, flags);
        if (!page) {
                ext4_journal_stop(handle);
                ret = -ENOMEM;
@@ -2625,6 +2725,10 @@ static int ext4_da_write_end(struct file *file,
                }
        }
+        trace_mark(ext4_da_write_end,
+                   "dev %s ino %lu pos %llu len %u copied %u",
+                   inode->i_sb->s_id, inode->i_ino,
+                   (unsigned long long) pos, len, copied);
        start = pos & (PAGE_CACHE_SIZE - 1);
        end = start + copied - 1;
@@ -2717,7 +2821,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
                filemap_write_and_wait(mapping);
        }
-        if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
+        if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
                /*
                 * This is a REALLY heavyweight approach, but the use of
                 * bmap on dirty files is expected to be extremely rare:
@@ -2835,6 +2939,9 @@ static int ext4_normal_writepage(struct page *page,
        loff_t size = i_size_read(inode);
        loff_t len;
+        trace_mark(ext4_normal_writepage,
+                   "dev %s ino %lu page_index %lu",
+                   inode->i_sb->s_id, inode->i_ino, page->index);
        J_ASSERT(PageLocked(page));
        if (page->index == size >> PAGE_CACHE_SHIFT)
                len = size & ~PAGE_CACHE_MASK;
@@ -2920,6 +3027,9 @@ static int ext4_journalled_writepage(struct page *page,
        loff_t size = i_size_read(inode);
        loff_t len;
+        trace_mark(ext4_journalled_writepage,
+                   "dev %s ino %lu page_index %lu",
+                   inode->i_sb->s_id, inode->i_ino, page->index);
        J_ASSERT(PageLocked(page));
        if (page->index == size >> PAGE_CACHE_SHIFT)
                len = size & ~PAGE_CACHE_MASK;
@@ -2988,7 +3098,10 @@ static void ext4_invalidatepage(struct page *page, unsigned long offset)
        if (offset == 0)
                ClearPageChecked(page);
-        jbd2_journal_invalidatepage(journal, page, offset);
+        if (journal)
+                jbd2_journal_invalidatepage(journal, page, offset);
+        else
+                block_invalidatepage(page, offset);
 }
 static int ext4_releasepage(struct page *page, gfp_t wait)
@@ -2998,7 +3111,10 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
        WARN_ON(PageChecked(page));
        if (!page_has_buffers(page))
                return 0;
-        return jbd2_journal_try_to_free_buffers(journal, page, wait);
+        if (journal)
+                return jbd2_journal_try_to_free_buffers(journal, page, wait);
+        else
+                return try_to_free_buffers(page);
 }
 /*
@@ -3270,7 +3386,7 @@ int ext4_block_truncate_page(handle_t *handle,
        err = 0;
        if (ext4_should_journal_data(inode)) {
-                err = ext4_journal_dirty_metadata(handle, bh);
+                err = ext4_handle_dirty_metadata(handle, inode, bh);
        } else {
                if (ext4_should_order_data(inode))
                        err = ext4_jbd2_file_inode(handle, inode);
@@ -3394,8 +3510,8 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
        __le32 *p;
        if (try_to_extend_transaction(handle, inode)) {
                if (bh) {
-                        BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
+                        BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
-                        ext4_journal_dirty_metadata(handle, bh);
+                        ext4_handle_dirty_metadata(handle, inode, bh);
                }
                ext4_mark_inode_dirty(handle, inode);
                ext4_journal_test_restart(handle, inode);
@@ -3495,7 +3611,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
                                  count, block_to_free_p, p);
        if (this_bh) {
-                BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata");
+                BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata");
                /*
                 * The buffer head should have an attached journal head at this
@@ -3503,8 +3619,8 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
                 * block pointed to itself, it would have been detached when
                 * the block was cleared. Check for this instead of OOPSing.
                 */
-                if (bh2jh(this_bh))
+                if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
-                        ext4_journal_dirty_metadata(handle, this_bh);
+                        ext4_handle_dirty_metadata(handle, inode, this_bh);
                else
                        ext4_error(inode->i_sb, __func__,
                                   "circular indirect block detected, "
@@ -3534,7 +3650,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
        ext4_fsblk_t nr;
        __le32 *p;
-        if (is_handle_aborted(handle))
+        if (ext4_handle_is_aborted(handle))
                return;
        if (depth--) {
@@ -3604,7 +3720,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
                         * will merely complain about releasing a free block,
                         * rather than leaking blocks.
                         */
-                        if (is_handle_aborted(handle))
+                        if (ext4_handle_is_aborted(handle))
                                return;
                        if (try_to_extend_transaction(handle, inode)) {
                                ext4_mark_inode_dirty(handle, inode);
@@ -3623,9 +3739,10 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
                                                                   parent_bh)){
                                        *p = 0;
                                        BUFFER_TRACE(parent_bh,
-                                        "call ext4_journal_dirty_metadata");
+                                        "call ext4_handle_dirty_metadata");
-                                        ext4_journal_dirty_metadata(handle,
+                                        ext4_handle_dirty_metadata(handle,
-                                                                    parent_bh);
+                                                                   inode,
+                                                                   parent_bh);
                                }
                        }
                }
@@ -3813,7 +3930,7 @@ do_indirects:
         * synchronous
         */
        if (IS_SYNC(inode))
-                handle->h_sync = 1;
+                ext4_handle_sync(handle);
 out_stop:
        /*
         * If this was a simple ftruncate(), and the file will remain alive
@@ -3843,7 +3960,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
        ext4_fsblk_t            block;
        int                     inodes_per_block, inode_offset;
-        iloc->bh = 0;
+        iloc->bh = NULL;
        if (!ext4_valid_inum(sb, inode->i_ino))
                return -EIO;
@@ -3950,7 +4067,7 @@ make_io:
                        num = EXT4_INODES_PER_GROUP(sb);
                        if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
                                       EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
-                                num -= le16_to_cpu(gdp->bg_itable_unused);
+                                num -= ext4_itable_unused_count(sb, gdp);
                        table += num / inodes_per_block;
                        if (end > table)
                                end = table;
@@ -4164,9 +4281,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
                inode->i_op = &ext4_dir_inode_operations;
                inode->i_fop = &ext4_dir_operations;
        } else if (S_ISLNK(inode->i_mode)) {
-                if (ext4_inode_is_fast_symlink(inode))
+                if (ext4_inode_is_fast_symlink(inode)) {
                        inode->i_op = &ext4_fast_symlink_inode_operations;
-                else {
+                        nd_terminate_link(ei->i_data, inode->i_size,
+                                sizeof(ei->i_data) - 1);
+                } else {
                        inode->i_op = &ext4_symlink_inode_operations;
                        ext4_set_aops(inode);
                }
@@ -4310,8 +4429,8 @@ static int ext4_do_update_inode(handle_t *handle,
                        EXT4_SET_RO_COMPAT_FEATURE(sb,
                                        EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
                        sb->s_dirt = 1;
-                        handle->h_sync = 1;
+                        ext4_handle_sync(handle);
-                        err = ext4_journal_dirty_metadata(handle,
+                        err = ext4_handle_dirty_metadata(handle, inode,
                                        EXT4_SB(sb)->s_sbh);
                }
        }
@@ -4338,9 +4457,8 @@ static int ext4_do_update_inode(handle_t *handle,
                raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
        }
+        BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
-        BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
+        rc = ext4_handle_dirty_metadata(handle, inode, bh);
-        rc = ext4_journal_dirty_metadata(handle, bh);
        if (!err)
                err = rc;
        ei->i_state &= ~EXT4_STATE_NEW;
@@ -4403,6 +4521,25 @@ int ext4_write_inode(struct inode *inode, int wait)
        return ext4_force_commit(inode->i_sb);
 }
+/*
+ * Mark @bh dirty and, when @inode is non-NULL and inode_needs_sync()
+ * reports true for it, write the buffer out with sync_dirty_buffer()
+ * before returning.
+ *
+ * Returns 0, or -EIO (after reporting through ext4_error()) when,
+ * following the sync, buffer_req() is set on @bh but the buffer is
+ * not uptodate.
+ */
+int __ext4_write_dirty_metadata(struct inode *inode, struct buffer_head *bh)
+{
+        mark_buffer_dirty(bh);
+        /* nothing more to do unless this inode asks for a sync write */
+        if (!inode || !inode_needs_sync(inode))
+                return 0;
+        sync_dirty_buffer(bh);
+        if (!buffer_req(bh) || buffer_uptodate(bh))
+                return 0;
+        ext4_error(inode->i_sb, __func__,
+                   "IO error syncing inode, "
+                   "inode=%lu, block=%llu",
+                   inode->i_ino,
+                   (unsigned long long)bh->b_blocknr);
+        return -EIO;
+}
 /*
 * ext4_setattr()
 *
@@ -4707,16 +4844,15 @@ int
 ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
                         struct ext4_iloc *iloc)
 {
-        int err = 0;
+        int err;
-        if (handle) {
-                err = ext4_get_inode_loc(inode, iloc);
+        err = ext4_get_inode_loc(inode, iloc);
-                if (!err) {
+        if (!err) {
-                        BUFFER_TRACE(iloc->bh, "get_write_access");
+                BUFFER_TRACE(iloc->bh, "get_write_access");
-                        err = ext4_journal_get_write_access(handle, iloc->bh);
+                err = ext4_journal_get_write_access(handle, iloc->bh);
-                        if (err) {
+                if (err) {
-                                brelse(iloc->bh);
+                        brelse(iloc->bh);
-                                iloc->bh = NULL;
+                        iloc->bh = NULL;
-                        }
                }
        }
        ext4_std_error(inode->i_sb, err);
@@ -4788,7 +4924,8 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
        might_sleep();
        err = ext4_reserve_inode_write(handle, inode, &iloc);
-        if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
+        if (ext4_handle_valid(handle) &&
+            EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
            !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) {
                /*
                 * We need extra buffer credits since we may write into EA block
@@ -4840,6 +4977,11 @@ void ext4_dirty_inode(struct inode *inode)
        handle_t *current_handle = ext4_journal_current_handle();
        handle_t *handle;
+        if (!ext4_handle_valid(current_handle)) {
+                ext4_mark_inode_dirty(current_handle, inode);
+                return;
+        }
        handle = ext4_journal_start(inode, 2);
        if (IS_ERR(handle))
                goto out;
@@ -4877,8 +5019,9 @@ static int ext4_pin_inode(handle_t *handle, struct inode *inode)
                        BUFFER_TRACE(iloc.bh, "get_write_access");
                        err = jbd2_journal_get_write_access(handle, iloc.bh);
                        if (!err)
-                                err = ext4_journal_dirty_metadata(handle,
+                                err = ext4_handle_dirty_metadata(handle,
-                                                                  iloc.bh);
+                                                                 inode,
+                                                                 iloc.bh);
                        brelse(iloc.bh);
                }
        }
@@ -4904,6 +5047,8 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
         */
        journal = EXT4_JOURNAL(inode);
+        if (!journal)
+                return 0;
        if (is_journal_aborted(journal))
                return -EROFS;
@@ -4933,7 +5078,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
                return PTR_ERR(handle);
        err = ext4_mark_inode_dirty(handle, inode);
-        handle->h_sync = 1;
+        ext4_handle_sync(handle);
        ext4_journal_stop(handle);
        ext4_std_error(inode->i_sb, err);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index dc99b4776d58..42dc83fb247a 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -99,7 +99,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
                        goto flags_out;
                }
                if (IS_SYNC(inode))
-                        handle->h_sync = 1;
+                        ext4_handle_sync(handle);
                err = ext4_reserve_inode_write(handle, inode, &iloc);
                if (err)
                        goto flags_err;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 444ad998f72e..deba54f6cbed 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -100,7 +100,7 @@
 * inode as:
 *
 *  {                        page                        }
- *  [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]...
+ *  [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
 *
 *
 * one block each for bitmap and buddy information.  So for each group we
@@ -330,6 +330,18 @@
 *        object
 *
 */
+static struct kmem_cache *ext4_pspace_cachep;
+static struct kmem_cache *ext4_ac_cachep;
+static struct kmem_cache *ext4_free_ext_cachep;
+static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
+                                        ext4_group_t group);
+static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
+                                                ext4_group_t group);
+static int ext4_mb_init_per_dev_proc(struct super_block *sb);
+static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
+static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
 static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
 {
@@ -445,9 +457,9 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
                        blocknr += first + i;
                        blocknr +=
                            le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
+                        ext4_grp_locked_error(sb, e4b->bd_group,
-                        ext4_error(sb, __func__, "double-free of inode"
+                                   __func__, "double-free of inode"
-                                   " %lu's block %llu(bit %u in group %lu)\n",
+                                   " %lu's block %llu(bit %u in group %u)",
                                   inode ? inode->i_ino : 0, blocknr,
                                   first + i, e4b->bd_group);
                }
@@ -477,7 +489,7 @@ static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
                b2 = (unsigned char *) bitmap;
                for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
                        if (b1[i] != b2[i]) {
-                                printk(KERN_ERR "corruption in group %lu "
+                                printk(KERN_ERR "corruption in group %u "
                                       "at byte %u(%u): %x in copy != %x "
                                       "on disk/prealloc\n",
                                       e4b->bd_group, i, i * 8, b1[i], b2[i]);
@@ -690,8 +702,8 @@ static void ext4_mb_generate_buddy(struct super_block *sb,
        grp->bb_fragments = fragments;
        if (free != grp->bb_free) {
-                ext4_error(sb, __func__,
+                ext4_grp_locked_error(sb, group,  __func__,
-                        "EXT4-fs: group %lu: %u blocks in bitmap, %u in gd\n",
+                        "EXT4-fs: group %u: %u blocks in bitmap, %u in gd",
                        group, free, grp->bb_free);
                /*
                 * If we intent to continue, we consider group descritor
@@ -716,7 +728,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb,
 * stored in the inode as
 *
 * {                        page                        }
- * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]...
+ * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
 *
 *
 * one block each for bitmap and buddy information.
@@ -782,25 +794,45 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
                if (bh[i] == NULL)
                        goto out;
-                if (buffer_uptodate(bh[i]) &&
+                if (bitmap_uptodate(bh[i]))
-                    !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))
                        continue;
                lock_buffer(bh[i]);
+                if (bitmap_uptodate(bh[i])) {
+                        unlock_buffer(bh[i]);
+                        continue;
+                }
                spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
                if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
                        ext4_init_block_bitmap(sb, bh[i],
                                                first_group + i, desc);
+                        set_bitmap_uptodate(bh[i]);
                        set_buffer_uptodate(bh[i]);
-                        unlock_buffer(bh[i]);
                        spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
+                        unlock_buffer(bh[i]);
                        continue;
                }
                spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
+                if (buffer_uptodate(bh[i])) {
+                        /*
+                         * the group is not uninit here, so if bh
+                         * is uptodate the bitmap is uptodate too
+                         */
+                        set_bitmap_uptodate(bh[i]);
+                        unlock_buffer(bh[i]);
+                        continue;
+                }
                get_bh(bh[i]);
+                /*
+                 * submit the buffer_head for read. We can
+                 * safely mark the bitmap as uptodate now.
+                 * We do it here so the bitmap uptodate bit
+                 * get set with buffer lock held.
+                 */
+                set_bitmap_uptodate(bh[i]);
                bh[i]->b_end_io = end_buffer_read_sync;
                submit_bh(READ, bh[i]);
-                mb_debug("read bitmap for group %lu\n", first_group + i);
+                mb_debug("read bitmap for group %u\n", first_group + i);
        }
        /* wait for I/O completion */
@@ -814,6 +846,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
        err = 0;
        first_block = page->index * blocks_per_page;
+        /* init the page  */
+        memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
        for (i = 0; i < blocks_per_page; i++) {
                int group;
                struct ext4_group_info *grinfo;
@@ -840,7 +874,6 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
                        BUG_ON(incore == NULL);
                        mb_debug("put buddy for group %u in page %lu/%x\n",
                                group, page->index, i * blocksize);
-                        memset(data, 0xff, blocksize);
                        grinfo = ext4_get_group_info(sb, group);
                        grinfo->bb_fragments = 0;
                        memset(grinfo->bb_counters, 0,
@@ -848,7 +881,9 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
                        /*
                         * incore got set to the group block bitmap below
                         */
+                        ext4_lock_group(sb, group);
                        ext4_mb_generate_buddy(sb, data, incore, group);
+                        ext4_unlock_group(sb, group);
                        incore = NULL;
                } else {
                        /* this is block of bitmap */
@@ -862,6 +897,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
                        /* mark all preallocated blks used in in-core bitmap */
                        ext4_mb_generate_from_pa(sb, data, group);
+                        ext4_mb_generate_from_freelist(sb, data, group);
                        ext4_unlock_group(sb, group);
                        /* set incore so that the buddy information can be
@@ -886,18 +922,20 @@ static noinline_for_stack int
 ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
                                        struct ext4_buddy *e4b)
 {
-        struct ext4_sb_info *sbi = EXT4_SB(sb);
-        struct inode *inode = sbi->s_buddy_cache;
        int blocks_per_page;
        int block;
        int pnum;
        int poff;
        struct page *page;
        int ret;
+        struct ext4_group_info *grp;
+        struct ext4_sb_info *sbi = EXT4_SB(sb);
+        struct inode *inode = sbi->s_buddy_cache;
-        mb_debug("load group %lu\n", group);
+        mb_debug("load group %u\n", group);
        blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+        grp = ext4_get_group_info(sb, group);
        e4b->bd_blkbits = sb->s_blocksize_bits;
        e4b->bd_info = ext4_get_group_info(sb, group);
@@ -905,6 +943,15 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
        e4b->bd_group = group;
        e4b->bd_buddy_page = NULL;
        e4b->bd_bitmap_page = NULL;
+        e4b->alloc_semp = &grp->alloc_sem;
+        /* Take the read lock on the group alloc
+         * sem. This would make sure a parallel
+         * ext4_mb_init_group happening on other
+         * groups mapped by the page is blocked
+         * till we are done with allocation
+         */
+        down_read(e4b->alloc_semp);
        /*
         * the buddy cache inode stores the block bitmap
@@ -920,6 +967,14 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
        page = find_get_page(inode->i_mapping, pnum);
        if (page == NULL || !PageUptodate(page)) {
                if (page)
+                        /*
+                         * drop the page reference and try
+                         * to get the page with lock. If we
+                         * are not uptodate that implies
+                         * somebody just created the page but
+                         * is yet to initialize the same. So
+                         * wait for it to initialize.
+                         */
                        page_cache_release(page);
                page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
                if (page) {
@@ -985,6 +1040,9 @@ err:
                page_cache_release(e4b->bd_buddy_page);
        e4b->bd_buddy = NULL;
        e4b->bd_bitmap = NULL;
+        /* Done with the buddy cache */
+        up_read(e4b->alloc_semp);
        return ret;
 }
@@ -994,6 +1052,9 @@ static void ext4_mb_release_desc(struct ext4_buddy *e4b)
                page_cache_release(e4b->bd_bitmap_page);
        if (e4b->bd_buddy_page)
                page_cache_release(e4b->bd_buddy_page);
+        /* Done with the buddy cache */
+        if (e4b->alloc_semp)
+                up_read(e4b->alloc_semp);
 }
@@ -1031,7 +1092,10 @@ static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len)
                        cur += 32;
                        continue;
                }
-                mb_clear_bit_atomic(lock, cur, bm);
+                if (lock)
+                        mb_clear_bit_atomic(lock, cur, bm);
+                else
+                        mb_clear_bit(cur, bm);
                cur++;
        }
 }
@@ -1049,7 +1113,10 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
                        cur += 32;
                        continue;
                }
-                mb_set_bit_atomic(lock, cur, bm);
+                if (lock)
+                        mb_set_bit_atomic(lock, cur, bm);
+                else
+                        mb_set_bit(cur, bm);
                cur++;
        }
 }
@@ -1094,12 +1161,11 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
                        blocknr += block;
                        blocknr +=
                            le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
-                        ext4_unlock_group(sb, e4b->bd_group);
+                        ext4_grp_locked_error(sb, e4b->bd_group,
-                        ext4_error(sb, __func__, "double-free of inode"
+                                   __func__, "double-free of inode"
-                                   " %lu's block %llu(bit %u in group %lu)\n",
+                                   " %lu's block %llu(bit %u in group %u)",
                                   inode ? inode->i_ino : 0, blocknr, block,
                                   e4b->bd_group);
-                        ext4_lock_group(sb, e4b->bd_group);
                }
                mb_clear_bit(block, EXT4_MB_BITMAP(e4b));
                e4b->bd_info->bb_counters[order]++;
@@ -1296,13 +1362,20 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
        ac->ac_tail = ret & 0xffff;
        ac->ac_buddy = ret >> 16;
-        /* XXXXXXX: SUCH A HORRIBLE **CK */
+        /*
-        /*FIXME!! Why ? */
+         * take the page reference. We want the page to be pinned
+         * so that we don't get an ext4_mb_init_cache() call for this
+         * group until we update the bitmap. Otherwise we could
+         * double allocate blocks. The reference is dropped
+         * in ext4_mb_release_context
+         */
        ac->ac_bitmap_page = e4b->bd_bitmap_page;
        get_page(ac->ac_bitmap_page);
        ac->ac_buddy_page = e4b->bd_buddy_page;
        get_page(ac->ac_buddy_page);
+        /* on allocation we use ac to track the held semaphore */
+        ac->alloc_semp =  e4b->alloc_semp;
+        e4b->alloc_semp = NULL;
        /* store last allocated for subsequent stream allocation */
        if ((ac->ac_flags & EXT4_MB_HINT_DATA)) {
                spin_lock(&sbi->s_md_lock);
@@ -1326,6 +1399,8 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
        struct ext4_free_extent ex;
        int max;
+        if (ac->ac_status == AC_STATUS_FOUND)
+                return;
        /*
         * We don't want to scan for a whole year
         */
@@ -1575,8 +1650,9 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
                         * free blocks even though group info says we
                         * we have free blocks
                         */
-                        ext4_error(sb, __func__, "%d free blocks as per "
+                        ext4_grp_locked_error(sb, e4b->bd_group,
-                                        "group info. But bitmap says 0\n",
+                                        __func__, "%d free blocks as per "
+                                        "group info. But bitmap says 0",
                                        free);
                        break;
                }
@@ -1584,8 +1660,9 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
                mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex);
                BUG_ON(ex.fe_len <= 0);
                if (free < ex.fe_len) {
-                        ext4_error(sb, __func__, "%d free blocks as per "
+                        ext4_grp_locked_error(sb, e4b->bd_group,
-                                        "group info. But got %d blocks\n",
+                                        __func__, "%d free blocks as per "
+                                        "group info. But got %d blocks",
                                        free, ex.fe_len);
                        /*
                         * The number of free blocks differs. This mostly
@@ -1692,6 +1769,173 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
        return 0;
 }
+/*
+ * Lock the group_info alloc_sem (for writing) of all the groups
+ * belonging to the same buddy cache page as @group. This
+ * makes sure other parallel operations on the buddy
+ * cache don't happen while holding the buddy cache
+ * lock.
+ *
+ * Returns the number of groups that were locked; the caller
+ * passes that count to ext4_mb_put_buddy_cache_lock() to
+ * release them again.
+ */
+int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
+{
+        int i;
+        int block, pnum;
+        int blocks_per_page;
+        int groups_per_page;
+        ext4_group_t first_group;
+        struct ext4_group_info *grp;
+        blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+        /*
+         * the buddy cache inode stores the block bitmap
+         * and buddy information in consecutive blocks.
+         * So for each group we need two blocks.
+         *
+         * first_group is the first group stored in the page that
+         * holds @group's bitmap block. When a page holds only one
+         * block, blocks_per_page >> 1 is 0, so groups_per_page is
+         * forced to 1 and @group itself still gets locked.
+         */
+        block = group * 2;
+        pnum = block / blocks_per_page;
+        first_group = pnum * blocks_per_page / 2;
+        groups_per_page = blocks_per_page >> 1;
+        if (groups_per_page == 0)
+                groups_per_page = 1;
+        /* lock all groups the page covers, stopping at the last group */
+        for (i = 0; i < groups_per_page; i++) {
+                if ((first_group + i) >= EXT4_SB(sb)->s_groups_count)
+                        break;
+                grp = ext4_get_group_info(sb, first_group + i);
+                /* take the write allocation semaphore of
+                 * every group. This makes sure there is
+                 * no block allocation going on in any
+                 * of those groups. The loop index is passed
+                 * as the second (subclass) argument of
+                 * down_write_nested().
+                 */
+                down_write_nested(&grp->alloc_sem, i);
+        }
+        return i;
+}
+void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
+                                        ext4_group_t group, int locked_group)
+{
+        int i;
+        int block, pnum;
+        int blocks_per_page;
+        ext4_group_t first_group;
+        struct ext4_group_info *grp;
+        blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+        /*
+         * Counterpart of ext4_mb_get_buddy_cache_lock(): drops the
+         * alloc_sem write lock of the first @locked_group groups of
+         * the buddy cache page that holds @group.
+         *
+         * the buddy cache inode stores the block bitmap
+         * and buddy information in consecutive blocks.
+         * So for each group we need two blocks.
+         */
+        block = group * 2;
+        pnum = block / blocks_per_page;
+        first_group = pnum * blocks_per_page / 2;
+        /* release the locks on all the groups that were locked */
+        for (i = 0; i < locked_group; i++) {
+                grp = ext4_get_group_info(sb, first_group + i);
+                /* drop the write allocation semaphore of
+                 * this group, so that block allocation
+                 * (which takes it for reading) may go on
+                 * in the group again
+                 */
+                up_write(&grp->alloc_sem);
+        }
+}
+/*
+ * Initialise the buddy cache pages (block bitmap and buddy data) of
+ * @group.
+ *
+ * The alloc_sem of every group sharing the buddy cache page is taken
+ * for writing first; the bitmap page and then the buddy page are
+ * built with ext4_mb_init_cache().
+ *
+ * Returns 0 on success or when the group no longer needs
+ * initialisation, -EIO when a page could not be obtained or is not
+ * uptodate afterwards, or the error returned by ext4_mb_init_cache().
+ */
+static int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
+{
+        int ret;
+        void *bitmap;
+        int blocks_per_page;
+        int block, pnum, poff;
+        int num_grp_locked = 0;
+        struct ext4_group_info *this_grp;
+        struct ext4_sb_info *sbi = EXT4_SB(sb);
+        struct inode *inode = sbi->s_buddy_cache;
+        struct page *page = NULL, *bitmap_page = NULL;
+        mb_debug("init group %u\n", group);
+        blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+        this_grp = ext4_get_group_info(sb, group);
+        /*
+         * This ensures we don't add group
+         * to this buddy cache via resize
+         */
+        num_grp_locked =  ext4_mb_get_buddy_cache_lock(sb, group);
+        if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
+                /*
+                 * somebody initialized the group
+                 * return without doing anything
+                 */
+                ret = 0;
+                goto err;
+        }
+        /*
+         * the buddy cache inode stores the block bitmap
+         * and buddy information in consecutive blocks.
+         * So for each group we need two blocks.
+         */
+        block = group * 2;
+        pnum = block / blocks_per_page;
+        poff = block % blocks_per_page;
+        page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+        if (page) {
+                BUG_ON(page->mapping != inode->i_mapping);
+                /* NULL incore: this call sets up the bitmap block(s) */
+                ret = ext4_mb_init_cache(page, NULL);
+                if (ret) {
+                        unlock_page(page);
+                        goto err;
+                }
+                unlock_page(page);
+        }
+        if (page == NULL || !PageUptodate(page)) {
+                ret = -EIO;
+                goto err;
+        }
+        mark_page_accessed(page);
+        /* keep the bitmap page; its reference is dropped at err: */
+        bitmap_page = page;
+        bitmap = page_address(page) + (poff * sb->s_blocksize);
+        /* init buddy cache */
+        block++;
+        pnum = block / blocks_per_page;
+        poff = block % blocks_per_page;
+        page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+        if (page == bitmap_page) {
+                /*
+                 * If both the bitmap and buddy are in
+                 * the same page we don't need to force
+                 * init the buddy
+                 */
+                unlock_page(page);
+        } else if (page) {
+                BUG_ON(page->mapping != inode->i_mapping);
+                /* build the buddy from the bitmap set up above */
+                ret = ext4_mb_init_cache(page, bitmap);
+                if (ret) {
+                        unlock_page(page);
+                        goto err;
+                }
+                unlock_page(page);
+        }
+        if (page == NULL || !PageUptodate(page)) {
+                ret = -EIO;
+                goto err;
+        }
+        mark_page_accessed(page);
+err:
+        /* common exit: drop the group locks and both page references */
+        ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
+        if (bitmap_page)
+                page_cache_release(bitmap_page);
+        if (page)
+                page_cache_release(page);
+        return ret;
+}
 static noinline_for_stack int
 ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 {
@@ -1775,7 +2019,7 @@ repeat:
                                group = 0;
                        /* quick check to skip empty groups */
-                        grp = ext4_get_group_info(ac->ac_sb, group);
+                        grp = ext4_get_group_info(sb, group);
                        if (grp->bb_free == 0)
                                continue;
@@ -1788,10 +2032,9 @@ repeat:
                                 * we need full data about the group
                                 * to make a good selection
                                 */
-                                err = ext4_mb_load_buddy(sb, group, &e4b);
+                                err = ext4_mb_init_group(sb, group);
                                if (err)
                                        goto out;
-                                ext4_mb_release_desc(&e4b);
                        }
                        /*
@@ -1932,13 +2175,13 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
        if (hs->op == EXT4_MB_HISTORY_ALLOC) {
                fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u "
                        "%-5u %-5s %-5u %-6u\n";
-                sprintf(buf2, "%lu/%d/%u@%u", hs->result.fe_group,
+                sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
                        hs->result.fe_start, hs->result.fe_len,
                        hs->result.fe_logical);
-                sprintf(buf, "%lu/%d/%u@%u", hs->orig.fe_group,
+                sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group,
                        hs->orig.fe_start, hs->orig.fe_len,
                        hs->orig.fe_logical);
-                sprintf(buf3, "%lu/%d/%u@%u", hs->goal.fe_group,
+                sprintf(buf3, "%u/%d/%u@%u", hs->goal.fe_group,
                        hs->goal.fe_start, hs->goal.fe_len,
                        hs->goal.fe_logical);
                seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2,
@@ -1947,20 +2190,20 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
                                hs->buddy ? 1 << hs->buddy : 0);
        } else if (hs->op == EXT4_MB_HISTORY_PREALLOC) {
                fmt = "%-5u %-8u %-23s %-23s %-23s\n";
-                sprintf(buf2, "%lu/%d/%u@%u", hs->result.fe_group,
+                sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
                        hs->result.fe_start, hs->result.fe_len,
                        hs->result.fe_logical);
-                sprintf(buf, "%lu/%d/%u@%u", hs->orig.fe_group,
+                sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group,
                        hs->orig.fe_start, hs->orig.fe_len,
                        hs->orig.fe_logical);
                seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2);
        } else if (hs->op == EXT4_MB_HISTORY_DISCARD) {
-                sprintf(buf2, "%lu/%d/%u", hs->result.fe_group,
+                sprintf(buf2, "%u/%d/%u", hs->result.fe_group,
                        hs->result.fe_start, hs->result.fe_len);
                seq_printf(seq, "%-5u %-8u %-23s discard\n",
                                hs->pid, hs->ino, buf2);
        } else if (hs->op == EXT4_MB_HISTORY_FREE) {
-                sprintf(buf2, "%lu/%d/%u", hs->result.fe_group,
+                sprintf(buf2, "%u/%d/%u", hs->result.fe_group,
                        hs->result.fe_start, hs->result.fe_len);
                seq_printf(seq, "%-5u %-8u %-23s free\n",
                                hs->pid, hs->ino, buf2);
@@ -2073,7 +2316,7 @@ static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
                return NULL;
        group = *pos + 1;
-        return (void *) group;
+        return (void *) ((unsigned long) group);
 }
 static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
@@ -2086,13 +2329,13 @@ static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
        if (*pos < 0 || *pos >= sbi->s_groups_count)
                return NULL;
        group = *pos + 1;
-        return (void *) group;;
+        return (void *) ((unsigned long) group);
 }
 static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
 {
        struct super_block *sb = seq->private;
-        long group = (long) v;
+        ext4_group_t group = (ext4_group_t) ((unsigned long) v);
        int i;
        int err;
        struct ext4_buddy e4b;
@@ -2114,7 +2357,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
                sizeof(struct ext4_group_info);
        err = ext4_mb_load_buddy(sb, group, &e4b);
        if (err) {
-                seq_printf(seq, "#%-5lu: I/O error\n", group);
+                seq_printf(seq, "#%-5u: I/O error\n", group);
                return 0;
        }
        ext4_lock_group(sb, group);
@@ -2122,7 +2365,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
        ext4_unlock_group(sb, group);
        ext4_mb_release_desc(&e4b);
-        seq_printf(seq, "#%-5lu: %-5u %-5u %-5u [", group, sg.info.bb_free,
+        seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
                        sg.info.bb_fragments, sg.info.bb_first_free);
        for (i = 0; i <= 13; i++)
                seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ?
@@ -2296,10 +2539,11 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
                        ext4_free_blocks_after_init(sb, group, desc);
        } else {
                meta_group_info[i]->bb_free =
-                        le16_to_cpu(desc->bg_free_blocks_count);
+                        ext4_free_blks_count(sb, desc);
        }
        INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
+        init_rwsem(&meta_group_info[i]->alloc_sem);
        meta_group_info[i]->bb_free_root.rb_node = NULL;;
 #ifdef DOUBLE_CHECK
@@ -2327,54 +2571,6 @@ exit_meta_group_info:
 } /* ext4_mb_add_groupinfo */
 /*
- * Add a group to the existing groups.
- * This function is used for online resize
- */
-int ext4_mb_add_more_groupinfo(struct super_block *sb, ext4_group_t group,
-                               struct ext4_group_desc *desc)
-{
-        struct ext4_sb_info *sbi = EXT4_SB(sb);
-        struct inode *inode = sbi->s_buddy_cache;
-        int blocks_per_page;
-        int block;
-        int pnum;
-        struct page *page;
-        int err;
-        /* Add group based on group descriptor*/
-        err = ext4_mb_add_groupinfo(sb, group, desc);
-        if (err)
-                return err;
-        /*
-         * Cache pages containing dynamic mb_alloc datas (buddy and bitmap
-         * datas) are set not up to date so that they will be re-initilaized
-         * during the next call to ext4_mb_load_buddy
-         */
-        /* Set buddy page as not up to date */
-        blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
-        block = group * 2;
-        pnum = block / blocks_per_page;
-        page = find_get_page(inode->i_mapping, pnum);
-        if (page != NULL) {
-                ClearPageUptodate(page);
-                page_cache_release(page);
-        }
-        /* Set bitmap page as not up to date */
-        block++;
-        pnum = block / blocks_per_page;
-        page = find_get_page(inode->i_mapping, pnum);
-        if (page != NULL) {
-                ClearPageUptodate(page);
-                page_cache_release(page);
-        }
-        return 0;
-}
-/*
 * Update an existing group.
 * This function is used for online resize
 */
@@ -2457,7 +2653,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
                desc = ext4_get_group_desc(sb, i, NULL);
                if (desc == NULL) {
                        printk(KERN_ERR
-                                "EXT4-fs: can't read descriptor %lu\n", i);
+                                "EXT4-fs: can't read descriptor %u\n", i);
                        goto err_freebuddy;
                }
                if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
@@ -2493,6 +2689,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
        if (sbi->s_mb_offsets == NULL) {
                return -ENOMEM;
        }
+        i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int);
        sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
        if (sbi->s_mb_maxs == NULL) {
                kfree(sbi->s_mb_maxs);
@@ -2551,7 +2749,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
        ext4_mb_init_per_dev_proc(sb);
        ext4_mb_history_init(sb);
-        sbi->s_journal->j_commit_callback = release_blocks_on_commit;
+        if (sbi->s_journal)
+                sbi->s_journal->j_commit_callback = release_blocks_on_commit;
        printk(KERN_INFO "EXT4-fs: mballoc enabled\n");
        return 0;
@@ -2652,7 +2851,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
        list_for_each_safe(l, ltmp, &txn->t_private_list) {
                entry = list_entry(l, struct ext4_free_data, list);
-                mb_debug("gonna free %u blocks in group %lu (0x%p):",
+                mb_debug("gonna free %u blocks in group %u (0x%p):",
                         entry->count, entry->group, entry);
                err = ext4_mb_load_buddy(sb, entry->group, &e4b);
@@ -2679,8 +2878,9 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
                discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb)
                        + entry->start_blk
                        + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
-                trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u", sb->s_id,
+                trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u",
-                           (unsigned long long) discard_block, entry->count);
+                           sb->s_id, (unsigned long long) discard_block,
+                           entry->count);
                sb_issue_discard(sb, discard_block, entry->count);
                kmem_cache_free(ext4_free_ext_cachep, entry);
@@ -2791,7 +2991,7 @@ void exit_ext4_mballoc(void)
 */
 static noinline_for_stack int
 ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
-                                handle_t *handle, unsigned long reserv_blks)
+                                handle_t *handle, unsigned int reserv_blks)
 {
        struct buffer_head *bitmap_bh = NULL;
        struct ext4_super_block *es;
@@ -2824,8 +3024,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
        if (!gdp)
                goto out_err;
-        ext4_debug("using block group %lu(%d)\n", ac->ac_b_ex.fe_group,
+        ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
-                        gdp->bg_free_blocks_count);
+                        ext4_free_blks_count(sb, gdp));
        err = ext4_journal_get_write_access(handle, gdp_bh);
        if (err)
@@ -2843,8 +3043,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
            in_range(block + len - 1, ext4_inode_table(sb, gdp),
                     EXT4_SB(sb)->s_itb_per_group)) {
                ext4_error(sb, __func__,
-                           "Allocating block in system zone - block = %llu",
+                           "Allocating block %llu in system zone of %d group\n",
-                           block);
+                           block, ac->ac_b_ex.fe_group);
                /* File system mounted not to panic on error
                 * Fix the bitmap and repeat the block allocation
                 * We leak some of the blocks here.
@@ -2852,7 +3052,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
                mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group),
                                bitmap_bh->b_data, ac->ac_b_ex.fe_start,
                                ac->ac_b_ex.fe_len);
-                err = ext4_journal_dirty_metadata(handle, bitmap_bh);
+                err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
                if (!err)
                        err = -EAGAIN;
                goto out_err;
@@ -2866,18 +3066,17 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
                }
        }
 #endif
-        mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), bitmap_bh->b_data,
-                                ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
        spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
+        mb_set_bits(NULL, bitmap_bh->b_data,
+                                ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
        if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
                gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
-                gdp->bg_free_blocks_count =
+                ext4_free_blks_set(sb, gdp,
-                        cpu_to_le16(ext4_free_blocks_after_init(sb,
+                                        ext4_free_blocks_after_init(sb,
-                                                ac->ac_b_ex.fe_group,
+                                        ac->ac_b_ex.fe_group, gdp));
-                                                gdp));
        }
-        le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len);
+        len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len;
+        ext4_free_blks_set(sb, gdp, len);
        gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
        spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
        percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
@@ -2899,10 +3098,10 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
                spin_unlock(sb_bgl_lock(sbi, flex_group));
        }
-        err = ext4_journal_dirty_metadata(handle, bitmap_bh);
+        err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
        if (err)
                goto out_err;
-        err = ext4_journal_dirty_metadata(handle, gdp_bh);
+        err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
 out_err:
        sb->s_dirt = 1;
@@ -3031,7 +3230,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
        /* check we don't cross already preallocated blocks */
        rcu_read_lock();
        list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
-                unsigned long pa_end;
+                ext4_lblk_t pa_end;
                if (pa->pa_deleted)
                        continue;
@@ -3075,7 +3274,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
        /* XXX: extra loop to check we really don't overlap preallocations */
        rcu_read_lock();
        list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
-                unsigned long pa_end;
+                ext4_lblk_t pa_end;
                spin_lock(&pa->pa_lock);
                if (pa->pa_deleted == 0) {
                        pa_end = pa->pa_lstart + pa->pa_len;
@@ -3307,6 +3506,32 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
 }
 /*
+ * the function goes through all block freed in the group
+ * but not yet committed and marks them used in in-core bitmap.
+ * buddy must be generated from this bitmap
+ * Need to be called with ext4 group lock (ext4_lock_group)
+ */
+static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
+                                                ext4_group_t group)
+{
+        struct rb_node *n;
+        struct ext4_group_info *grp;
+        struct ext4_free_data *entry;
+        grp = ext4_get_group_info(sb, group);
+        n = rb_first(&(grp->bb_free_root));
+        while (n) {
+                entry = rb_entry(n, struct ext4_free_data, node);
+                mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group),
+                                bitmap, entry->start_blk,
+                                entry->count);
+                n = rb_next(n);
+        }
+        return;
+}
+/*
 * the function goes through all preallocation in this group and marks them
 * used in in-core bitmap. buddy must be generated from this bitmap
 * Need to be called with ext4 group lock (ext4_lock_group)
@@ -3346,7 +3571,7 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
                preallocated += len;
                count++;
        }
-        mb_debug("prellocated %u for group %lu\n", preallocated, group);
+        mb_debug("prellocated %u for group %u\n", preallocated, group);
 }
 static void ext4_mb_pa_callback(struct rcu_head *head)
@@ -3363,7 +3588,7 @@ static void ext4_mb_pa_callback(struct rcu_head *head)
 static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
                        struct super_block *sb, struct ext4_prealloc_space *pa)
 {
-        unsigned long grp;
+        ext4_group_t grp;
        if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0)
                return;
@@ -3473,6 +3698,10 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
        mb_debug("new inode pa %p: %llu/%u for %u\n", pa,
                        pa->pa_pstart, pa->pa_len, pa->pa_lstart);
+        trace_mark(ext4_mb_new_inode_pa,
+                   "dev %s ino %lu pstart %llu len %u lstart %u",
+                   sb->s_id, ac->ac_inode->i_ino,
+                   pa->pa_pstart, pa->pa_len, pa->pa_lstart);
        ext4_mb_use_inode_pa(ac, pa);
        atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
@@ -3530,7 +3759,9 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
        pa->pa_linear = 1;
        mb_debug("new group pa %p: %llu/%u for %u\n", pa,
-                        pa->pa_pstart, pa->pa_len, pa->pa_lstart);
+                 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
+        trace_mark(ext4_mb_new_group_pa, "dev %s pstart %llu len %u lstart %u",
+                   sb->s_id, pa->pa_pstart, pa->pa_len, pa->pa_lstart);
        ext4_mb_use_group_pa(ac, pa);
        atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
@@ -3579,16 +3810,18 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
 {
        struct super_block *sb = e4b->bd_sb;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
-        unsigned long end;
+        unsigned int end;
-        unsigned long next;
+        unsigned int next;
        ext4_group_t group;
        ext4_grpblk_t bit;
+        unsigned long long grp_blk_start;
        sector_t start;
        int err = 0;
        int free = 0;
        BUG_ON(pa->pa_deleted == 0);
        ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
+        grp_blk_start = pa->pa_pstart - bit;
        BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
        end = bit + pa->pa_len;
@@ -3618,6 +3851,10 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
                        ext4_mb_store_history(ac);
                }
+                trace_mark(ext4_mb_release_inode_pa,
+                           "dev %s ino %lu block %llu count %u",
+                           sb->s_id, pa->pa_inode->i_ino, grp_blk_start + bit,
+                           next - bit);
                mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
                bit = next + 1;
        }
@@ -3626,8 +3863,9 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
                        pa, (unsigned long) pa->pa_lstart,
                        (unsigned long) pa->pa_pstart,
                        (unsigned long) pa->pa_len);
-                ext4_error(sb, __func__, "free %u, pa_free %u\n",
+                ext4_grp_locked_error(sb, group,
-                                                free, pa->pa_free);
+                                        __func__, "free %u, pa_free %u",
+                                        free, pa->pa_free);
                /*
                 * pa is already deleted so we use the value obtained
                 * from the bitmap and continue.
@@ -3650,6 +3888,8 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
        if (ac)
                ac->ac_op = EXT4_MB_HISTORY_DISCARD;
+        trace_mark(ext4_mb_release_group_pa, "dev %s pstart %llu len %d",
+                   sb->s_id, pa->pa_pstart, pa->pa_len);
        BUG_ON(pa->pa_deleted == 0);
        ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
        BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
@@ -3692,7 +3932,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
        int busy = 0;
        int free = 0;
-        mb_debug("discard preallocation for group %lu\n", group);
+        mb_debug("discard preallocation for group %u\n", group);
        if (list_empty(&grp->bb_prealloc_list))
                return 0;
@@ -3700,14 +3940,14 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
        bitmap_bh = ext4_read_block_bitmap(sb, group);
        if (bitmap_bh == NULL) {
                ext4_error(sb, __func__, "Error in reading block "
-                                "bitmap for %lu\n", group);
+                                "bitmap for %u", group);
                return 0;
        }
        err = ext4_mb_load_buddy(sb, group, &e4b);
        if (err) {
                ext4_error(sb, __func__, "Error in loading buddy "
-                                "information for %lu\n", group);
+                                "information for %u", group);
                put_bh(bitmap_bh);
                return 0;
        }
@@ -3815,6 +4055,8 @@ void ext4_discard_preallocations(struct inode *inode)
        }
        mb_debug("discard preallocation for inode %lu\n", inode->i_ino);
+        trace_mark(ext4_discard_preallocations, "dev %s ino %lu", sb->s_id,
+                   inode->i_ino);
        INIT_LIST_HEAD(&list);
@@ -3874,14 +4116,14 @@ repeat:
                err = ext4_mb_load_buddy(sb, group, &e4b);
                if (err) {
                        ext4_error(sb, __func__, "Error in loading buddy "
-                                        "information for %lu\n", group);
+                                        "information for %u", group);
                        continue;
                }
                bitmap_bh = ext4_read_block_bitmap(sb, group);
                if (bitmap_bh == NULL) {
                        ext4_error(sb, __func__, "Error in reading block "
-                                        "bitmap for %lu\n", group);
+                                        "bitmap for %u", group);
                        ext4_mb_release_desc(&e4b);
                        continue;
                }
@@ -4024,8 +4266,8 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_super_block *es = sbi->s_es;
        ext4_group_t group;
-        unsigned long len;
+        unsigned int len;
-        unsigned long goal;
+        ext4_fsblk_t goal;
        ext4_grpblk_t block;
        /* we can't allocate > group size */
@@ -4068,6 +4310,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
        ac->ac_pa = NULL;
        ac->ac_bitmap_page = NULL;
        ac->ac_buddy_page = NULL;
+        ac->alloc_semp = NULL;
        ac->ac_lg = NULL;
        /* we have to define context: we'll we work with a file or
@@ -4146,7 +4389,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
                ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
                if (ext4_mb_load_buddy(sb, group, &e4b)) {
                        ext4_error(sb, __func__, "Error in loading buddy "
-                                        "information for %lu\n", group);
+                                        "information for %u", group);
                        continue;
                }
                ext4_lock_group(sb, group);
@@ -4248,6 +4491,8 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
                }
                ext4_mb_put_pa(ac, ac->ac_sb, pa);
        }
+        if (ac->alloc_semp)
+                up_read(ac->alloc_semp);
        if (ac->ac_bitmap_page)
                page_cache_release(ac->ac_bitmap_page);
        if (ac->ac_buddy_page)
@@ -4264,6 +4509,8 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
        int ret;
        int freed = 0;
+        trace_mark(ext4_mb_discard_preallocations, "dev %s needed %d",
+                   sb->s_id, needed);
        for (i = 0; i < EXT4_SB(sb)->s_groups_count && needed > 0; i++) {
                ret = ext4_mb_discard_group_preallocations(sb, i, needed);
                freed += ret;
@@ -4286,12 +4533,24 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
        struct ext4_sb_info *sbi;
        struct super_block *sb;
        ext4_fsblk_t block = 0;
-        unsigned long inquota;
+        unsigned int inquota;
-        unsigned long reserv_blks = 0;
+        unsigned int reserv_blks = 0;
        sb = ar->inode->i_sb;
        sbi = EXT4_SB(sb);
+        trace_mark(ext4_request_blocks, "dev %s flags %u len %u ino %lu "
+                   "lblk %llu goal %llu lleft %llu lright %llu "
+                   "pleft %llu pright %llu ",
+                   sb->s_id, ar->flags, ar->len,
+                   ar->inode ? ar->inode->i_ino : 0,
+                   (unsigned long long) ar->logical,
+                   (unsigned long long) ar->goal,
+                   (unsigned long long) ar->lleft,
+                   (unsigned long long) ar->lright,
+                   (unsigned long long) ar->pleft,
+                   (unsigned long long) ar->pright);
        if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) {
                /*
                 * With delalloc we already reserved the blocks
@@ -4313,7 +4572,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
        }
        if (ar->len == 0) {
                *errp = -EDQUOT;
-                return 0;
+                goto out3;
        }
        inquota = ar->len;
@@ -4348,10 +4607,14 @@ repeat:
                                ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
                        ext4_mb_new_preallocation(ac);
        }
        if (likely(ac->ac_status == AC_STATUS_FOUND)) {
                *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks);
                if (*errp ==  -EAGAIN) {
+                        /*
+                         * drop the reference that we took
+                         * in ext4_mb_use_best_found
+                         */
+                        ext4_mb_release_context(ac);
                        ac->ac_b_ex.fe_group = 0;
                        ac->ac_b_ex.fe_start = 0;
                        ac->ac_b_ex.fe_len = 0;
@@ -4382,6 +4645,26 @@ out2:
 out1:
        if (ar->len < inquota)
                DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len);
+out3:
+        if (!ar->len) {
+                if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag)
+                        /* release all the reserved blocks if non delalloc */
+                        percpu_counter_sub(&sbi->s_dirtyblocks_counter,
+                                                reserv_blks);
+        }
+        trace_mark(ext4_allocate_blocks,
+                   "dev %s block %llu flags %u len %u ino %lu "
+                   "logical %llu goal %llu lleft %llu lright %llu "
+                   "pleft %llu pright %llu ",
+                   sb->s_id, (unsigned long long) block,
+                   ar->flags, ar->len, ar->inode ? ar->inode->i_ino : 0,
+                   (unsigned long long) ar->logical,
+                   (unsigned long long) ar->goal,
+                   (unsigned long long) ar->lleft,
+                   (unsigned long long) ar->lright,
+                   (unsigned long long) ar->pleft,
+                   (unsigned long long) ar->pright);
        return block;
 }
@@ -4403,27 +4686,23 @@ static int can_merge(struct ext4_free_data *entry1,
 static noinline_for_stack int
 ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
-                          ext4_group_t group, ext4_grpblk_t block, int count)
+                      struct ext4_free_data *new_entry)
 {
+        ext4_grpblk_t block;
+        struct ext4_free_data *entry;
        struct ext4_group_info *db = e4b->bd_info;
        struct super_block *sb = e4b->bd_sb;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
-        struct ext4_free_data *entry, *new_entry;
        struct rb_node **n = &db->bb_free_root.rb_node, *node;
        struct rb_node *parent = NULL, *new_node;
+        BUG_ON(!ext4_handle_valid(handle));
        BUG_ON(e4b->bd_bitmap_page == NULL);
        BUG_ON(e4b->bd_buddy_page == NULL);
-        new_entry  = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
-        new_entry->start_blk = block;
-        new_entry->group  = group;
-        new_entry->count = count;
-        new_entry->t_tid = handle->h_transaction->t_tid;
        new_node = &new_entry->node;
+        block = new_entry->start_blk;
-        ext4_lock_group(sb, group);
        if (!*n) {
                /* first free block exent. We need to
                   protect buddy cache from being freed,
@@ -4441,10 +4720,9 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
                else if (block >= (entry->start_blk + entry->count))
                        n = &(*n)->rb_right;
                else {
-                        ext4_unlock_group(sb, group);
+                        ext4_grp_locked_error(sb, e4b->bd_group, __func__,
-                        ext4_error(sb, __func__,
+                                        "Double free of blocks %d (%d %d)",
-                            "Double free of blocks %d (%d %d)\n",
+                                        block, entry->start_blk, entry->count);
-                            block, entry->start_blk, entry->count);
                        return 0;
                }
        }
@@ -4483,7 +4761,6 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
        spin_lock(&sbi->s_md_lock);
        list_add(&new_entry->list, &handle->h_transaction->t_private_list);
        spin_unlock(&sbi->s_md_lock);
-        ext4_unlock_group(sb, group);
        return 0;
 }
@@ -4499,7 +4776,7 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
        struct ext4_allocation_context *ac = NULL;
        struct ext4_group_desc *gdp;
        struct ext4_super_block *es;
-        unsigned long overflow;
+        unsigned int overflow;
        ext4_grpblk_t bit;
        struct buffer_head *gd_bh;
        ext4_group_t block_group;
@@ -4522,6 +4799,10 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
        }
        ext4_debug("freeing block %lu\n", block);
+        trace_mark(ext4_free_blocks,
+                   "dev %s block %llu count %lu metadata %d ino %lu",
+                   sb->s_id, (unsigned long long) block, count, metadata,
+                   inode ? inode->i_ino : 0);
        ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
        if (ac) {
@@ -4581,11 +4862,6 @@ do_more:
        err = ext4_journal_get_write_access(handle, gd_bh);
        if (err)
                goto error_return;
-        err = ext4_mb_load_buddy(sb, block_group, &e4b);
-        if (err)
-                goto error_return;
 #ifdef AGGRESSIVE_CHECK
        {
                int i;
@@ -4593,13 +4869,6 @@ do_more:
                        BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
        }
 #endif
-        mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
-                        bit, count);
-        /* We dirtied the bitmap block */
-        BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
-        err = ext4_journal_dirty_metadata(handle, bitmap_bh);
        if (ac) {
                ac->ac_b_ex.fe_group = block_group;
                ac->ac_b_ex.fe_start = bit;
@@ -4607,19 +4876,41 @@ do_more:
                ext4_mb_store_history(ac);
        }
-        if (metadata) {
+        err = ext4_mb_load_buddy(sb, block_group, &e4b);
-                /* blocks being freed are metadata. these blocks shouldn't
+        if (err)
-                 * be used until this transaction is committed */
+                goto error_return;
-                ext4_mb_free_metadata(handle, &e4b, block_group, bit, count);
+        if (metadata && ext4_handle_valid(handle)) {
+                struct ext4_free_data *new_entry;
+                /*
+                 * blocks being freed are metadata. these blocks shouldn't
+                 * be used until this transaction is committed
+                 */
+                new_entry  = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
+                new_entry->start_blk = bit;
+                new_entry->group  = block_group;
+                new_entry->count = count;
+                new_entry->t_tid = handle->h_transaction->t_tid;
+                ext4_lock_group(sb, block_group);
+                mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
+                                bit, count);
+                ext4_mb_free_metadata(handle, &e4b, new_entry);
+                ext4_unlock_group(sb, block_group);
        } else {
                ext4_lock_group(sb, block_group);
+                /* need to update group_info->bb_free and bitmap
+                 * with group lock held. generate_buddy look at
+                 * them with group lock_held
+                 */
+                mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
+                                bit, count);
                mb_free_blocks(inode, &e4b, bit, count);
                ext4_mb_return_to_preallocation(inode, &e4b, block, count);
                ext4_unlock_group(sb, block_group);
        }
        spin_lock(sb_bgl_lock(sbi, block_group));
-        le16_add_cpu(&gdp->bg_free_blocks_count, count);
+        ret = ext4_free_blks_count(sb, gdp) + count;
+        ext4_free_blks_set(sb, gdp, ret);
        gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
        spin_unlock(sb_bgl_lock(sbi, block_group));
        percpu_counter_add(&sbi->s_freeblocks_counter, count);
@@ -4635,9 +4926,13 @@ do_more:
        *freed += count;
+        /* We dirtied the bitmap block */
+        BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
+        err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
        /* And the group descriptor block */
        BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
-        ret = ext4_journal_dirty_metadata(handle, gd_bh);
+        ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
        if (!err)
                err = ret;
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index b5dff1fff1e5..10a2921baf14 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -20,6 +20,7 @@
 #include <linux/version.h>
 #include <linux/blkdev.h>
 #include <linux/marker.h>
+#include <linux/mutex.h>
 #include "ext4_jbd2.h"
 #include "ext4.h"
 #include "group.h"
@@ -98,9 +99,6 @@
 */
 #define MB_DEFAULT_GROUP_PREALLOC       512
-static struct kmem_cache *ext4_pspace_cachep;
-static struct kmem_cache *ext4_ac_cachep;
-static struct kmem_cache *ext4_free_ext_cachep;
 struct ext4_free_data {
        /* this links the free block information from group_info */
@@ -120,26 +118,6 @@ struct ext4_free_data {
        tid_t   t_tid;
 };
-struct ext4_group_info {
-        unsigned long   bb_state;
-        struct rb_root  bb_free_root;
-        unsigned short  bb_first_free;
-        unsigned short  bb_free;
-        unsigned short  bb_fragments;
-        struct          list_head bb_prealloc_list;
-#ifdef DOUBLE_CHECK
-        void            *bb_bitmap;
-#endif
-        unsigned short  bb_counters[];
-};
-#define EXT4_GROUP_INFO_NEED_INIT_BIT   0
-#define EXT4_GROUP_INFO_LOCKED_BIT      1
-#define EXT4_MB_GRP_NEED_INIT(grp)      \
-        (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
 struct ext4_prealloc_space {
        struct list_head        pa_inode_list;
        struct list_head        pa_group_list;
@@ -217,6 +195,11 @@ struct ext4_allocation_context {
        __u8 ac_op;             /* operation, for history only */
        struct page *ac_bitmap_page;
        struct page *ac_buddy_page;
+        /*
+         * pointer to the held semaphore upon successful
+         * block allocation
+         */
+        struct rw_semaphore *alloc_semp;
        struct ext4_prealloc_space *ac_pa;
        struct ext4_locality_group *ac_lg;
 };
@@ -250,6 +233,7 @@ struct ext4_buddy {
        struct super_block *bd_sb;
        __u16 bd_blkbits;
        ext4_group_t bd_group;
+        struct rw_semaphore *alloc_semp;
 };
 #define EXT4_MB_BITMAP(e4b)     ((e4b)->bd_bitmap)
 #define EXT4_MB_BUDDY(e4b)      ((e4b)->bd_buddy)
@@ -259,51 +243,12 @@ static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
 {
        return;
 }
-#else
-static void ext4_mb_store_history(struct ext4_allocation_context *ac);
 #endif
 #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
 struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
+static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
-static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
-                                        ext4_group_t group);
-static void ext4_mb_return_to_preallocation(struct inode *inode,
-                                        struct ext4_buddy *e4b, sector_t block,
-                                        int count);
-static void ext4_mb_put_pa(struct ext4_allocation_context *,
-                        struct super_block *, struct ext4_prealloc_space *pa);
-static int ext4_mb_init_per_dev_proc(struct super_block *sb);
-static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
-static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
-static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
-{
-        struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
-        bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
-}
-static inline void ext4_unlock_group(struct super_block *sb,
-                                        ext4_group_t group)
-{
-        struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
-        bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
-}
-static inline int ext4_is_group_locked(struct super_block *sb,
-                                        ext4_group_t group)
-{
-        struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
-        return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT,
-                                                &(grinfo->bb_state));
-}
-static ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
                                        struct ext4_free_extent *fex)
 {
        ext4_fsblk_t block;
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index f2a9cf498ecd..734abca25e35 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -59,7 +59,8 @@ static int finish_range(handle_t *handle, struct inode *inode,
        /*
         * Make sure the credit we accumalated is not really high
         */
-        if (needed && handle->h_buffer_credits >= EXT4_RESERVE_TRANS_BLOCKS) {
+        if (needed && ext4_handle_has_enough_credits(handle,
+                                                EXT4_RESERVE_TRANS_BLOCKS)) {
                retval = ext4_journal_restart(handle, needed);
                if (retval)
                        goto err_out;
@@ -229,7 +230,7 @@ static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode)
 {
        int retval = 0, needed;
-        if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS)
+        if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
                return 0;
        /*
         * We are freeing a blocks. During this we touch
@@ -458,13 +459,13 @@ int ext4_ext_migrate(struct inode *inode)
        struct list_blocks_struct lb;
        unsigned long max_entries;
-        if (!test_opt(inode->i_sb, EXTENTS))
+        /*
-                /*
+         * If the filesystem does not support extents, or the inode
-                 * if mounted with noextents we don't allow the migrate
+         * already is extent-based, error out.
-                 */
+         */
-                return -EINVAL;
+        if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
+                                       EXT4_FEATURE_INCOMPAT_EXTENTS) ||
-        if ((EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+            (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
                return -EINVAL;
        if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 63adcb792988..ba702bd7910d 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -74,10 +74,6 @@ static struct buffer_head *ext4_append(handle_t *handle,
 #define assert(test) J_ASSERT(test)
 #endif
-#ifndef swap
-#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0)
-#endif
 #ifdef DX_DEBUG
 #define dxtrace(command) command
 #else
@@ -372,6 +368,8 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
                goto fail;
        }
        hinfo->hash_version = root->info.hash_version;
+        if (hinfo->hash_version <= DX_HASH_TEA)
+                hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
        hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
        if (d_name)
                ext4fs_dirhash(d_name->name, d_name->len, hinfo);
@@ -641,6 +639,9 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
        dir = dir_file->f_path.dentry->d_inode;
        if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) {
                hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
+                if (hinfo.hash_version <= DX_HASH_TEA)
+                        hinfo.hash_version +=
+                                EXT4_SB(dir->i_sb)->s_hash_unsigned;
                hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
                count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
                                               start_hash, start_minor_hash);
@@ -806,7 +807,7 @@ static inline int ext4_match (int len, const char * const name,
 static inline int search_dirblock(struct buffer_head *bh,
                                  struct inode *dir,
                                  const struct qstr *d_name,
-                                  unsigned long offset,
+                                  unsigned int offset,
                                  struct ext4_dir_entry_2 ** res_dir)
 {
        struct ext4_dir_entry_2 * de;
@@ -1043,11 +1044,11 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
        bh = ext4_find_entry(dir, &dentry->d_name, &de);
        inode = NULL;
        if (bh) {
-                unsigned long ino = le32_to_cpu(de->inode);
+                __u32 ino = le32_to_cpu(de->inode);
                brelse(bh);
                if (!ext4_valid_inum(dir->i_sb, ino)) {
                        ext4_error(dir->i_sb, "ext4_lookup",
-                                   "bad inode number: %lu", ino);
+                                   "bad inode number: %u", ino);
                        return ERR_PTR(-EIO);
                }
                inode = ext4_iget(dir->i_sb, ino);
@@ -1060,7 +1061,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
 struct dentry *ext4_get_parent(struct dentry *child)
 {
-        unsigned long ino;
+        __u32 ino;
        struct inode *inode;
        static const struct qstr dotdot = {
                .name = "..",
@@ -1078,7 +1079,7 @@ struct dentry *ext4_get_parent(struct dentry *child)
        if (!ext4_valid_inum(child->d_inode->i_sb, ino)) {
                ext4_error(child->d_inode->i_sb, "ext4_get_parent",
-                           "bad inode number: %lu", ino);
+                           "bad inode number: %u", ino);
                return ERR_PTR(-EIO);
        }
@@ -1166,9 +1167,9 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
        u32 hash2;
        struct dx_map_entry *map;
        char *data1 = (*bh)->b_data, *data2;
-        unsigned split, move, size, i;
+        unsigned split, move, size;
        struct ext4_dir_entry_2 *de = NULL, *de2;
-        int     err = 0;
+        int     err = 0, i;
        bh2 = ext4_append (handle, dir, &newblock, &err);
        if (!(bh2)) {
@@ -1228,10 +1229,10 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
                de = de2;
        }
        dx_insert_block(frame, hash2 + continued, newblock);
-        err = ext4_journal_dirty_metadata(handle, bh2);
+        err = ext4_handle_dirty_metadata(handle, dir, bh2);
        if (err)
                goto journal_error;
-        err = ext4_journal_dirty_metadata(handle, frame->bh);
+        err = ext4_handle_dirty_metadata(handle, dir, frame->bh);
        if (err)
                goto journal_error;
        brelse(bh2);
@@ -1266,7 +1267,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
        struct inode    *dir = dentry->d_parent->d_inode;
        const char      *name = dentry->d_name.name;
        int             namelen = dentry->d_name.len;
-        unsigned long   offset = 0;
+        unsigned int    offset = 0;
        unsigned short  reclen;
        int             nlen, rlen, err;
        char            *top;
@@ -1335,8 +1336,8 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
        ext4_update_dx_flag(dir);
        dir->i_version++;
        ext4_mark_inode_dirty(handle, dir);
-        BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
+        BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
-        err = ext4_journal_dirty_metadata(handle, bh);
+        err = ext4_handle_dirty_metadata(handle, dir, bh);
        if (err)
                ext4_std_error(dir->i_sb, err);
        brelse(bh);
@@ -1367,7 +1368,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
        struct fake_dirent *fde;
        blocksize =  dir->i_sb->s_blocksize;
-        dxtrace(printk(KERN_DEBUG "Creating index\n"));
+        dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino));
        retval = ext4_journal_get_write_access(handle, bh);
        if (retval) {
                ext4_std_error(dir->i_sb, retval);
@@ -1376,6 +1377,20 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
        }
        root = (struct dx_root *) bh->b_data;
+        /* The 0th block becomes the root, move the dirents out */
+        fde = &root->dotdot;
+        de = (struct ext4_dir_entry_2 *)((char *)fde +
+                ext4_rec_len_from_disk(fde->rec_len));
+        if ((char *) de >= (((char *) root) + blocksize)) {
+                ext4_error(dir->i_sb, __func__,
+                           "invalid rec_len for '..' in inode %lu",
+                           dir->i_ino);
+                brelse(bh);
+                return -EIO;
+        }
+        len = ((char *) root) + blocksize - (char *) de;
+        /* Allocate new block for the 0th block's dirents */
        bh2 = ext4_append(handle, dir, &block, &retval);
        if (!(bh2)) {
                brelse(bh);
@@ -1384,11 +1399,6 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
        EXT4_I(dir)->i_flags |= EXT4_INDEX_FL;
        data1 = bh2->b_data;
-        /* The 0th block becomes the root, move the dirents out */
-        fde = &root->dotdot;
-        de = (struct ext4_dir_entry_2 *)((char *)fde +
-                ext4_rec_len_from_disk(fde->rec_len));
-        len = ((char *) root) + blocksize - (char *) de;
        memcpy (data1, de, len);
        de = (struct ext4_dir_entry_2 *) data1;
        top = data1 + len;
@@ -1408,6 +1418,8 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
        /* Initialize as for dx_probe */
        hinfo.hash_version = root->info.hash_version;
+        if (hinfo.hash_version <= DX_HASH_TEA)
+                hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
        hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
        ext4fs_dirhash(name, namelen, &hinfo);
        frame = frames;
@@ -1437,7 +1449,6 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
                          struct inode *inode)
 {
        struct inode *dir = dentry->d_parent->d_inode;
-        unsigned long offset;
        struct buffer_head *bh;
        struct ext4_dir_entry_2 *de;
        struct super_block *sb;
@@ -1459,7 +1470,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
                ext4_mark_inode_dirty(handle, dir);
        }
        blocks = dir->i_size >> sb->s_blocksize_bits;
-        for (block = 0, offset = 0; block < blocks; block++) {
+        for (block = 0; block < blocks; block++) {
                bh = ext4_bread(handle, dir, block, 0, &retval);
                if(!bh)
                        return retval;
@@ -1574,7 +1585,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
                        dxtrace(dx_show_index("node", frames[1].entries));
                        dxtrace(dx_show_index("node",
                               ((struct dx_node *) bh2->b_data)->entries));
-                        err = ext4_journal_dirty_metadata(handle, bh2);
+                        err = ext4_handle_dirty_metadata(handle, inode, bh2);
                        if (err)
                                goto journal_error;
                        brelse (bh2);
@@ -1600,7 +1611,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
                        if (err)
                                goto journal_error;
                }
-                ext4_journal_dirty_metadata(handle, frames[0].bh);
+                ext4_handle_dirty_metadata(handle, inode, frames[0].bh);
        }
        de = do_split(handle, dir, &bh, frame, &hinfo, &err);
        if (!de)
@@ -1646,8 +1657,8 @@ static int ext4_delete_entry(handle_t *handle,
                        else
                                de->inode = 0;
                        dir->i_version++;
-                        BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
+                        BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
-                        ext4_journal_dirty_metadata(handle, bh);
+                        ext4_handle_dirty_metadata(handle, dir, bh);
                        return 0;
                }
                i += ext4_rec_len_from_disk(de->rec_len);
@@ -1693,9 +1704,11 @@ static int ext4_add_nondir(handle_t *handle,
        if (!err) {
                ext4_mark_inode_dirty(handle, inode);
                d_instantiate(dentry, inode);
+                unlock_new_inode(inode);
                return 0;
        }
        drop_nlink(inode);
+        unlock_new_inode(inode);
        iput(inode);
        return err;
 }
@@ -1723,7 +1736,7 @@ retry:
                return PTR_ERR(handle);
        if (IS_DIRSYNC(dir))
-                handle->h_sync = 1;
+                ext4_handle_sync(handle);
        inode = ext4_new_inode (handle, dir, mode);
        err = PTR_ERR(inode);
@@ -1757,7 +1770,7 @@ retry:
                return PTR_ERR(handle);
        if (IS_DIRSYNC(dir))
-                handle->h_sync = 1;
+                ext4_handle_sync(handle);
        inode = ext4_new_inode(handle, dir, mode);
        err = PTR_ERR(inode);
@@ -1793,7 +1806,7 @@ retry:
                return PTR_ERR(handle);
        if (IS_DIRSYNC(dir))
-                handle->h_sync = 1;
+                ext4_handle_sync(handle);
        inode = ext4_new_inode(handle, dir, S_IFDIR | mode);
        err = PTR_ERR(inode);
@@ -1822,14 +1835,15 @@ retry:
        strcpy(de->name, "..");
        ext4_set_de_type(dir->i_sb, de, S_IFDIR);
        inode->i_nlink = 2;
-        BUFFER_TRACE(dir_block, "call ext4_journal_dirty_metadata");
+        BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
-        ext4_journal_dirty_metadata(handle, dir_block);
+        ext4_handle_dirty_metadata(handle, dir, dir_block);
        brelse(dir_block);
        ext4_mark_inode_dirty(handle, inode);
        err = ext4_add_entry(handle, dentry, inode);
        if (err) {
 out_clear_inode:
                clear_nlink(inode);
+                unlock_new_inode(inode);
                ext4_mark_inode_dirty(handle, inode);
                iput(inode);
                goto out_stop;
@@ -1838,6 +1852,7 @@ out_clear_inode:
        ext4_update_dx_flag(dir);
        ext4_mark_inode_dirty(handle, dir);
        d_instantiate(dentry, inode);
+        unlock_new_inode(inode);
 out_stop:
        ext4_journal_stop(handle);
        if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
@@ -1850,7 +1865,7 @@ out_stop:
 */
 static int empty_dir(struct inode *inode)
 {
-        unsigned long offset;
+        unsigned int offset;
        struct buffer_head *bh;
        struct ext4_dir_entry_2 *de, *de1;
        struct super_block *sb;
@@ -1895,7 +1910,7 @@ static int empty_dir(struct inode *inode)
                                if (err)
                                        ext4_error(sb, __func__,
                                                   "error %d reading directory"
-                                                   " #%lu offset %lu",
+                                                   " #%lu offset %u",
                                                   err, inode->i_ino, offset);
                                offset += sb->s_blocksize;
                                continue;
@@ -1933,6 +1948,9 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
        struct ext4_iloc iloc;
        int err = 0, rc;
+        if (!ext4_handle_valid(handle))
+                return 0;
        lock_super(sb);
        if (!list_empty(&EXT4_I(inode)->i_orphan))
                goto out_unlock;
@@ -1961,7 +1979,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
        /* Insert this inode at the head of the on-disk orphan list... */
        NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
        EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
-        err = ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
+        err = ext4_handle_dirty_metadata(handle, inode, EXT4_SB(sb)->s_sbh);
        rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
        if (!err)
                err = rc;
@@ -1995,10 +2013,13 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
        struct list_head *prev;
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_sb_info *sbi;
-        unsigned long ino_next;
+        __u32 ino_next;
        struct ext4_iloc iloc;
        int err = 0;
+        if (!ext4_handle_valid(handle))
+                return 0;
        lock_super(inode->i_sb);
        if (list_empty(&ei->i_orphan)) {
                unlock_super(inode->i_sb);
@@ -2017,7 +2038,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
         * transaction handle with which to update the orphan list on
         * disk, but we still need to remove the inode from the linked
         * list in memory. */
-        if (!handle)
+        if (sbi->s_journal && !handle)
                goto out;
        err = ext4_reserve_inode_write(handle, inode, &iloc);
@@ -2025,19 +2046,19 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
                goto out_err;
        if (prev == &sbi->s_orphan) {
-                jbd_debug(4, "superblock will point to %lu\n", ino_next);
+                jbd_debug(4, "superblock will point to %u\n", ino_next);
                BUFFER_TRACE(sbi->s_sbh, "get_write_access");
                err = ext4_journal_get_write_access(handle, sbi->s_sbh);
                if (err)
                        goto out_brelse;
                sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
-                err = ext4_journal_dirty_metadata(handle, sbi->s_sbh);
+                err = ext4_handle_dirty_metadata(handle, inode, sbi->s_sbh);
        } else {
                struct ext4_iloc iloc2;
                struct inode *i_prev =
                        &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode;
-                jbd_debug(4, "orphan inode %lu will point to %lu\n",
+                jbd_debug(4, "orphan inode %lu will point to %u\n",
                          i_prev->i_ino, ino_next);
                err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
                if (err)
@@ -2082,7 +2103,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
                goto end_rmdir;
        if (IS_DIRSYNC(dir))
-                handle->h_sync = 1;
+                ext4_handle_sync(handle);
        inode = dentry->d_inode;
@@ -2136,7 +2157,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
                return PTR_ERR(handle);
        if (IS_DIRSYNC(dir))
-                handle->h_sync = 1;
+                ext4_handle_sync(handle);
        retval = -ENOENT;
        bh = ext4_find_entry(dir, &dentry->d_name, &de);
@@ -2193,7 +2214,7 @@ retry:
                return PTR_ERR(handle);
        if (IS_DIRSYNC(dir))
-                handle->h_sync = 1;
+                ext4_handle_sync(handle);
        inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO);
        err = PTR_ERR(inode);
@@ -2208,10 +2229,10 @@ retry:
                 * We have a transaction open.  All is sweetness.  It also sets
                 * i_size in generic_commit_write().
                 */
-                err = __page_symlink(inode, symname, l,
+                err = __page_symlink(inode, symname, l, 1);
-                                mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
                if (err) {
                        clear_nlink(inode);
+                        unlock_new_inode(inode);
                        ext4_mark_inode_dirty(handle, inode);
                        iput(inode);
                        goto out_stop;
@@ -2256,13 +2277,20 @@ retry:
                return PTR_ERR(handle);
        if (IS_DIRSYNC(dir))
-                handle->h_sync = 1;
+                ext4_handle_sync(handle);
        inode->i_ctime = ext4_current_time(inode);
        ext4_inc_count(handle, inode);
        atomic_inc(&inode->i_count);
-        err = ext4_add_nondir(handle, dentry, inode);
+        err = ext4_add_entry(handle, dentry, inode);
+        if (!err) {
+                ext4_mark_inode_dirty(handle, inode);
+                d_instantiate(dentry, inode);
+        } else {
+                drop_nlink(inode);
+                iput(inode);
+        }
        ext4_journal_stop(handle);
        if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
                goto retry;
@@ -2298,7 +2326,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
                return PTR_ERR(handle);
        if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
-                handle->h_sync = 1;
+                ext4_handle_sync(handle);
        old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de);
        /*
@@ -2352,8 +2380,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
                new_dir->i_ctime = new_dir->i_mtime =
                                        ext4_current_time(new_dir);
                ext4_mark_inode_dirty(handle, new_dir);
-                BUFFER_TRACE(new_bh, "call ext4_journal_dirty_metadata");
+                BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata");
-                ext4_journal_dirty_metadata(handle, new_bh);
+                ext4_handle_dirty_metadata(handle, new_dir, new_bh);
                brelse(new_bh);
                new_bh = NULL;
        }
@@ -2403,8 +2431,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
                BUFFER_TRACE(dir_bh, "get_write_access");
                ext4_journal_get_write_access(handle, dir_bh);
                PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
-                BUFFER_TRACE(dir_bh, "call ext4_journal_dirty_metadata");
+                BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
-                ext4_journal_dirty_metadata(handle, dir_bh);
+                ext4_handle_dirty_metadata(handle, old_dir, dir_bh);
                ext4_dec_count(handle, old_dir);
                if (new_inode) {
                        /* checked empty_dir above, can't have another parent,
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index b6ec1843a015..c06886abd658 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -50,7 +50,7 @@ static int verify_group_input(struct super_block *sb,
        ext4_get_group_no_and_offset(sb, start, NULL, &offset);
        if (group != sbi->s_groups_count)
                ext4_warning(sb, __func__,
-                             "Cannot add at group %u (only %lu groups)",
+                             "Cannot add at group %u (only %u groups)",
                             input->group, sbi->s_groups_count);
        else if (offset != 0)
                        ext4_warning(sb, __func__, "Last group not full");
@@ -149,7 +149,7 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh,
 {
        int err;
-        if (handle->h_buffer_credits >= thresh)
+        if (ext4_handle_has_enough_credits(handle, thresh))
                return 0;
        err = ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA);
@@ -232,7 +232,7 @@ static int setup_new_group_blocks(struct super_block *sb,
                memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size);
                set_buffer_uptodate(gdb);
                unlock_buffer(gdb);
-                ext4_journal_dirty_metadata(handle, gdb);
+                ext4_handle_dirty_metadata(handle, NULL, gdb);
                ext4_set_bit(bit, bh->b_data);
                brelse(gdb);
        }
@@ -251,7 +251,7 @@ static int setup_new_group_blocks(struct super_block *sb,
                        err = PTR_ERR(bh);
                        goto exit_bh;
                }
-                ext4_journal_dirty_metadata(handle, gdb);
+                ext4_handle_dirty_metadata(handle, NULL, gdb);
                ext4_set_bit(bit, bh->b_data);
                brelse(gdb);
        }
@@ -276,7 +276,7 @@ static int setup_new_group_blocks(struct super_block *sb,
                        err = PTR_ERR(it);
                        goto exit_bh;
                }
-                ext4_journal_dirty_metadata(handle, it);
+                ext4_handle_dirty_metadata(handle, NULL, it);
                brelse(it);
                ext4_set_bit(bit, bh->b_data);
        }
@@ -284,11 +284,9 @@ static int setup_new_group_blocks(struct super_block *sb,
        if ((err = extend_or_restart_transaction(handle, 2, bh)))
                goto exit_bh;
-        mark_bitmap_end(input->blocks_count, EXT4_BLOCKS_PER_GROUP(sb),
+        mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, bh->b_data);
-                        bh->b_data);
+        ext4_handle_dirty_metadata(handle, NULL, bh);
-        ext4_journal_dirty_metadata(handle, bh);
        brelse(bh);
        /* Mark unused entries in inode bitmap used */
        ext4_debug("clear inode bitmap %#04llx (+%llu)\n",
                   input->inode_bitmap, input->inode_bitmap - start);
@@ -297,9 +295,9 @@ static int setup_new_group_blocks(struct super_block *sb,
                goto exit_journal;
        }
-        mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb),
+        mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
                        bh->b_data);
-        ext4_journal_dirty_metadata(handle, bh);
+        ext4_handle_dirty_metadata(handle, NULL, bh);
 exit_bh:
        brelse(bh);
@@ -486,12 +484,12 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
         * reserved inode, and will become GDT blocks (primary and backup).
         */
        data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0;
-        ext4_journal_dirty_metadata(handle, dind);
+        ext4_handle_dirty_metadata(handle, NULL, dind);
        brelse(dind);
        inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
        ext4_mark_iloc_dirty(handle, inode, &iloc);
        memset((*primary)->b_data, 0, sb->s_blocksize);
-        ext4_journal_dirty_metadata(handle, *primary);
+        ext4_handle_dirty_metadata(handle, NULL, *primary);
        o_group_desc = EXT4_SB(sb)->s_group_desc;
        memcpy(n_group_desc, o_group_desc,
@@ -502,7 +500,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
        kfree(o_group_desc);
        le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
-        ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
+        ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
        return 0;
@@ -618,7 +616,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
                       primary[i]->b_blocknr, gdbackups,
                       blk + primary[i]->b_blocknr); */
                data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr);
-                err2 = ext4_journal_dirty_metadata(handle, primary[i]);
+                err2 = ext4_handle_dirty_metadata(handle, NULL, primary[i]);
                if (!err)
                        err = err2;
        }
@@ -676,7 +674,8 @@ static void update_backups(struct super_block *sb,
                struct buffer_head *bh;
                /* Out of journal space, and can't get more - abort - so sad */
-                if (handle->h_buffer_credits == 0 &&
+                if (ext4_handle_valid(handle) &&
+                    handle->h_buffer_credits == 0 &&
                    ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA) &&
                    (err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA)))
                        break;
@@ -696,7 +695,7 @@ static void update_backups(struct super_block *sb,
                        memset(bh->b_data + size, 0, rest);
                set_buffer_uptodate(bh);
                unlock_buffer(bh);
-                ext4_journal_dirty_metadata(handle, bh);
+                ext4_handle_dirty_metadata(handle, NULL, bh);
                brelse(bh);
        }
        if ((err2 = ext4_journal_stop(handle)) && !err)
@@ -715,7 +714,7 @@ static void update_backups(struct super_block *sb,
 exit_err:
        if (err) {
                ext4_warning(sb, __func__,
-                             "can't update backup for group %lu (err %d), "
+                             "can't update backup for group %u (err %d), "
                             "forcing fsck on next reboot", group, err);
                sbi->s_mount_state &= ~EXT4_VALID_FS;
                sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
@@ -747,6 +746,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
        struct inode *inode = NULL;
        handle_t *handle;
        int gdb_off, gdb_num;
+        int num_grp_locked = 0;
        int err, err2;
        gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
@@ -761,13 +761,13 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
        if (ext4_blocks_count(es) + input->blocks_count <
            ext4_blocks_count(es)) {
-                ext4_warning(sb, __func__, "blocks_count overflow\n");
+                ext4_warning(sb, __func__, "blocks_count overflow");
                return -EINVAL;
        }
        if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) <
            le32_to_cpu(es->s_inodes_count)) {
-                ext4_warning(sb, __func__, "inodes_count overflow\n");
+                ext4_warning(sb, __func__, "inodes_count overflow");
                return -EINVAL;
        }
@@ -787,6 +787,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
                }
        }
        if ((err = verify_group_input(sb, input)))
                goto exit_put;
@@ -855,24 +856,29 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
         * using the new disk blocks.
         */
+        num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, input->group);
        /* Update group descriptor block for new group */
        gdp = (struct ext4_group_desc *)((char *)primary->b_data +
                                         gdb_off * EXT4_DESC_SIZE(sb));
+        memset(gdp, 0, EXT4_DESC_SIZE(sb));
        ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */
        ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */
        ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */
-        gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count);
+        ext4_free_blks_set(sb, gdp, input->free_blocks_count);
-        gdp->bg_free_inodes_count = cpu_to_le16(EXT4_INODES_PER_GROUP(sb));
+        ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
+        gdp->bg_flags = cpu_to_le16(EXT4_BG_INODE_ZEROED);
        gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
        /*
         * We can allocate memory for mb_alloc based on the new group
         * descriptor
         */
-        err = ext4_mb_add_more_groupinfo(sb, input->group, gdp);
+        err = ext4_mb_add_groupinfo(sb, input->group, gdp);
-        if (err)
+        if (err) {
+                ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
                goto exit_journal;
+        }
        /*
         * Make the new blocks and inodes valid next.  We do this before
@@ -914,8 +920,9 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
        /* Update the global fs size fields */
        sbi->s_groups_count++;
+        ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
-        ext4_journal_dirty_metadata(handle, primary);
+        ext4_handle_dirty_metadata(handle, NULL, primary);
        /* Update the reserved block counts only once the new group is
         * active. */
@@ -937,7 +944,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
                        EXT4_INODES_PER_GROUP(sb);
        }
-        ext4_journal_dirty_metadata(handle, sbi->s_sbh);
+        ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
        sb->s_dirt = 1;
 exit_journal:
@@ -975,9 +982,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
        struct buffer_head *bh;
        handle_t *handle;
        int err;
-        unsigned long freed_blocks;
        ext4_group_t group;
-        struct ext4_group_info *grp;
        /* We don't need to worry about locking wrt other resizers just
         * yet: we're going to revalidate es->s_blocks_count after
@@ -997,8 +1002,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
                        " too large to resize to %llu blocks safely\n",
                        sb->s_id, n_blocks_count);
                if (sizeof(sector_t) < 8)
-                        ext4_warning(sb, __func__,
+                        ext4_warning(sb, __func__, "CONFIG_LBD not enabled");
-                        "CONFIG_LBD not enabled\n");
                return -EINVAL;
        }
@@ -1071,62 +1075,18 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
                goto exit_put;
        }
        ext4_blocks_count_set(es, o_blocks_count + add);
-        ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
+        ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
        sb->s_dirt = 1;
        unlock_super(sb);
        ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
                   o_blocks_count + add);
-        ext4_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
+        /* We add the blocks to the bitmap and set the group need init bit */
+        ext4_add_groupblocks(handle, sb, o_blocks_count, add);
        ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
                   o_blocks_count + add);
        if ((err = ext4_journal_stop(handle)))
                goto exit_put;
-        /*
-         * Mark mballoc pages as not up to date so that they will be updated
-         * next time they are loaded by ext4_mb_load_buddy.
-         *
-         * XXX Bad, Bad, BAD!!!  We should not be overloading the
-         * Uptodate flag, particularly on thte bitmap bh, as way of
-         * hinting to ext4_mb_load_buddy() that it needs to be
-         * overloaded.  A user could take a LVM snapshot, then do an
-         * on-line fsck, and clear the uptodate flag, and this would
-         * not be a bug in userspace, but a bug in the kernel.  FIXME!!!
-         */
-        {
-                struct ext4_sb_info *sbi = EXT4_SB(sb);
-                struct inode *inode = sbi->s_buddy_cache;
-                int blocks_per_page;
-                int block;
-                int pnum;
-                struct page *page;
-                /* Set buddy page as not up to date */
-                blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
-                block = group * 2;
-                pnum = block / blocks_per_page;
-                page = find_get_page(inode->i_mapping, pnum);
-                if (page != NULL) {
-                        ClearPageUptodate(page);
-                        page_cache_release(page);
-                }
-                /* Set bitmap page as not up to date */
-                block++;
-                pnum = block / blocks_per_page;
-                page = find_get_page(inode->i_mapping, pnum);
-                if (page != NULL) {
-                        ClearPageUptodate(page);
-                        page_cache_release(page);
-                }
-                /* Get the info on the last group */
-                grp = ext4_get_group_info(sb, group);
-                /* Update free blocks in group info */
-                ext4_mb_update_group_info(grp, add);
-        }
        if (test_opt(sb, DEBUG))
                printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n",
                       ext4_blocks_count(es));
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 04158ad74dbb..e5f06a5f045e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -51,9 +51,7 @@ struct proc_dir_entry *ext4_proc_root;
 static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
                             unsigned long journal_devnum);
-static int ext4_create_journal(struct super_block *, struct ext4_super_block *,
+static int ext4_commit_super(struct super_block *sb,
-                               unsigned int);
-static void ext4_commit_super(struct super_block *sb,
                              struct ext4_super_block *es, int sync);
 static void ext4_mark_recovery_complete(struct super_block *sb,
                                        struct ext4_super_block *es);
@@ -64,9 +62,9 @@ static const char *ext4_decode_error(struct super_block *sb, int errno,
                                     char nbuf[16]);
 static int ext4_remount(struct super_block *sb, int *flags, char *data);
 static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
-static void ext4_unlockfs(struct super_block *sb);
+static int ext4_unfreeze(struct super_block *sb);
 static void ext4_write_super(struct super_block *sb);
-static void ext4_write_super_lockfs(struct super_block *sb);
+static int ext4_freeze(struct super_block *sb);
 ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
@@ -93,6 +91,38 @@ ext4_fsblk_t ext4_inode_table(struct super_block *sb,
                (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
 }
+/*
+ * Return the free-block count stored in group descriptor @bg: the low
+ * 16 bits always, combined with the high 16 bits when the descriptor
+ * size is at least EXT4_MIN_DESC_SIZE_64BIT.
+ */
+__u32 ext4_free_blks_count(struct super_block *sb,
+                              struct ext4_group_desc *bg)
+{
+        __u32 hi = 0;
+        if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
+                hi = (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16;
+        return le16_to_cpu(bg->bg_free_blocks_count_lo) | hi;
+}
+/*
+ * Return the free-inode count stored in group descriptor @bg: the low
+ * 16 bits always, combined with the high 16 bits when the descriptor
+ * size is at least EXT4_MIN_DESC_SIZE_64BIT.
+ */
+__u32 ext4_free_inodes_count(struct super_block *sb,
+                              struct ext4_group_desc *bg)
+{
+        __u32 hi = 0;
+        if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
+                hi = (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16;
+        return le16_to_cpu(bg->bg_free_inodes_count_lo) | hi;
+}
+/*
+ * Return the used-directories count stored in group descriptor @bg: the
+ * low 16 bits always, combined with the high 16 bits when the descriptor
+ * size is at least EXT4_MIN_DESC_SIZE_64BIT.
+ */
+__u32 ext4_used_dirs_count(struct super_block *sb,
+                              struct ext4_group_desc *bg)
+{
+        __u32 hi = 0;
+        if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
+                hi = (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16;
+        return le16_to_cpu(bg->bg_used_dirs_count_lo) | hi;
+}
+/*
+ * Return the bg_itable_unused value stored in group descriptor @bg: the
+ * low 16 bits always, combined with the high 16 bits when the descriptor
+ * size is at least EXT4_MIN_DESC_SIZE_64BIT.
+ */
+__u32 ext4_itable_unused_count(struct super_block *sb,
+                              struct ext4_group_desc *bg)
+{
+        __u32 hi = 0;
+        if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
+                hi = (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16;
+        return le16_to_cpu(bg->bg_itable_unused_lo) | hi;
+}
 void ext4_block_bitmap_set(struct super_block *sb,
                           struct ext4_group_desc *bg, ext4_fsblk_t blk)
 {
@@ -117,6 +147,38 @@ void ext4_inode_table_set(struct super_block *sb,
                bg->bg_inode_table_hi = cpu_to_le32(blk >> 32);
 }
+/*
+ * Store @count as the free-block count of group descriptor @bg.  The low
+ * 16 bits are always written; the high 16 bits are written only when the
+ * descriptor size is at least EXT4_MIN_DESC_SIZE_64BIT.
+ */
+void ext4_free_blks_set(struct super_block *sb,
+                          struct ext4_group_desc *bg, __u32 count)
+{
+        bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count);
+        if (EXT4_DESC_SIZE(sb) < EXT4_MIN_DESC_SIZE_64BIT)
+                return;
+        bg->bg_free_blocks_count_hi = cpu_to_le16(count >> 16);
+}
+/*
+ * Store @count as the free-inode count of group descriptor @bg.  The low
+ * 16 bits are always written; the high 16 bits are written only when the
+ * descriptor size is at least EXT4_MIN_DESC_SIZE_64BIT.
+ */
+void ext4_free_inodes_set(struct super_block *sb,
+                          struct ext4_group_desc *bg, __u32 count)
+{
+        bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count);
+        if (EXT4_DESC_SIZE(sb) < EXT4_MIN_DESC_SIZE_64BIT)
+                return;
+        bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16);
+}
+/*
+ * Store @count as the used-directories count of group descriptor @bg.
+ * The low 16 bits are always written; the high 16 bits are written only
+ * when the descriptor size is at least EXT4_MIN_DESC_SIZE_64BIT.
+ */
+void ext4_used_dirs_set(struct super_block *sb,
+                          struct ext4_group_desc *bg, __u32 count)
+{
+        bg->bg_used_dirs_count_lo = cpu_to_le16((__u16)count);
+        if (EXT4_DESC_SIZE(sb) < EXT4_MIN_DESC_SIZE_64BIT)
+                return;
+        bg->bg_used_dirs_count_hi = cpu_to_le16(count >> 16);
+}
+/*
+ * Store @count in the bg_itable_unused fields of group descriptor @bg.
+ * The low 16 bits are always written; the high 16 bits are written only
+ * when the descriptor size is at least EXT4_MIN_DESC_SIZE_64BIT.
+ */
+void ext4_itable_unused_set(struct super_block *sb,
+                          struct ext4_group_desc *bg, __u32 count)
+{
+        bg->bg_itable_unused_lo = cpu_to_le16((__u16)count);
+        if (EXT4_DESC_SIZE(sb) < EXT4_MIN_DESC_SIZE_64BIT)
+                return;
+        bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
+}
 /*
 * Wrappers for jbd2_journal_start/end.
 *
@@ -136,13 +198,19 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
         * backs (eg. EIO in the commit thread), then we still need to
         * take the FS itself readonly cleanly. */
        journal = EXT4_SB(sb)->s_journal;
-        if (is_journal_aborted(journal)) {
+        if (journal) {
-                ext4_abort(sb, __func__,
+                if (is_journal_aborted(journal)) {
-                           "Detected aborted journal");
+                        ext4_abort(sb, __func__,
-                return ERR_PTR(-EROFS);
+                                   "Detected aborted journal");
+                        return ERR_PTR(-EROFS);
+                }
+                return jbd2_journal_start(journal, nblocks);
        }
+        /*
-        return jbd2_journal_start(journal, nblocks);
+         * We're not journaling, return the appropriate indication.
+         */
+        current->journal_info = EXT4_NOJOURNAL_HANDLE;
+        return current->journal_info;
 }
 /*
@@ -157,6 +225,14 @@ int __ext4_journal_stop(const char *where, handle_t *handle)
        int err;
        int rc;
+        if (!ext4_handle_valid(handle)) {
+                /*
+                 * Do this here since we don't call jbd2_journal_stop() in
+                 * no-journal mode.
+                 */
+                current->journal_info = NULL;
+                return 0;
+        }
        sb = handle->h_transaction->t_journal->j_private;
        err = handle->h_err;
        rc = jbd2_journal_stop(handle);
@@ -174,6 +250,8 @@ void ext4_journal_abort_handle(const char *caller, const char *err_fn,
        char nbuf[16];
        const char *errstr = ext4_decode_error(NULL, err, nbuf);
+        BUG_ON(!ext4_handle_valid(handle));
        if (bh)
                BUFFER_TRACE(bh, "abort");
@@ -350,6 +428,44 @@ void ext4_warning(struct super_block *sb, const char *function,
        va_end(args);
 }
+/*
+ * Report a filesystem error on behalf of a caller working on block group
+ * @grp.  Presumably the caller holds that group's lock on entry (the
+ * function drops and retakes it, see the __releases/__acquires
+ * annotations) -- confirm against callers.
+ *
+ * The message built from @fmt is printed at KERN_CRIT, prefixed with the
+ * device id and @function.  With the ERRORS_CONT mount option the error
+ * state is recorded in the in-memory and on-disk superblock state, the
+ * superblock is committed, and the function returns without touching the
+ * group lock.  Otherwise the group lock is released around the call to
+ * ext4_handle_error() and taken again before returning.
+ */
+void ext4_grp_locked_error(struct super_block *sb, ext4_group_t grp,
+                                const char *function, const char *fmt, ...)
+__releases(bitlock)
+__acquires(bitlock)
+{
+        va_list args;
+        struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+        va_start(args, fmt);
+        printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function);
+        vprintk(fmt, args);
+        printk("\n");
+        va_end(args);
+        if (test_opt(sb, ERRORS_CONT)) {
+                /* Flag the error in memory and on disk, then carry on. */
+                EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
+                es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
+                ext4_commit_super(sb, es, 0);
+                return;
+        }
+        ext4_unlock_group(sb, grp);
+        ext4_handle_error(sb);
+        /*
+         * We only get here in the ERRORS_RO case; relocking the group
+         * may be dangerous, but nothing bad will happen since the
+         * filesystem will have already been marked read/only and the
+         * journal has been aborted.
+         *
+         * NOTE(review): this function returns void, so callers cannot
+         * currently use a return value to distinguish between the
+         * ERRORS_CONT and ERRORS_RO cases.  Letting them return more
+         * aggressively from the ext4 function in question, with a more
+         * appropriate error code, would require a non-void return type.
+         */
+        ext4_lock_group(sb, grp);
+        return;
+}
 void ext4_update_dynamic_rev(struct super_block *sb)
 {
        struct ext4_super_block *es = EXT4_SB(sb)->s_es;
@@ -389,7 +505,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev)
        return bdev;
 fail:
-        printk(KERN_ERR "EXT4: failed to open journal device %s: %ld\n",
+        printk(KERN_ERR "EXT4-fs: failed to open journal device %s: %ld\n",
                        __bdevname(dev, b), PTR_ERR(bdev));
        return NULL;
 }
@@ -448,11 +564,13 @@ static void ext4_put_super(struct super_block *sb)
        ext4_mb_release(sb);
        ext4_ext_release(sb);
        ext4_xattr_put_super(sb);
-        err = jbd2_journal_destroy(sbi->s_journal);
+        if (sbi->s_journal) {
-        sbi->s_journal = NULL;
+                err = jbd2_journal_destroy(sbi->s_journal);
-        if (err < 0)
+                sbi->s_journal = NULL;
-                ext4_abort(sb, __func__, "Couldn't clean up the journal");
+                if (err < 0)
+                        ext4_abort(sb, __func__,
+                                   "Couldn't clean up the journal");
+        }
        if (!(sb->s_flags & MS_RDONLY)) {
                EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
                es->s_state = cpu_to_le16(sbi->s_mount_state);
@@ -522,6 +640,11 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
        memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
        INIT_LIST_HEAD(&ei->i_prealloc_list);
        spin_lock_init(&ei->i_prealloc_lock);
+        /*
+         * Note:  We can be called before EXT4_SB(sb)->s_journal is set,
+         * therefore it can be null here.  Don't check it, just initialize
+         * jinode.
+         */
        jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode);
        ei->i_reserved_data_blocks = 0;
        ei->i_reserved_meta_blocks = 0;
@@ -588,7 +711,8 @@ static void ext4_clear_inode(struct inode *inode)
        }
 #endif
        ext4_discard_preallocations(inode);
-        jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
+        if (EXT4_JOURNAL(inode))
+                jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
                                       &EXT4_I(inode)->jinode);
 }
@@ -681,10 +805,19 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 #endif
        if (!test_opt(sb, RESERVATION))
                seq_puts(seq, ",noreservation");
-        if (sbi->s_commit_interval) {
+        if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
                seq_printf(seq, ",commit=%u",
                           (unsigned) (sbi->s_commit_interval / HZ));
        }
+        if (sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) {
+                seq_printf(seq, ",min_batch_time=%u",
+                           (unsigned) sbi->s_min_batch_time);
+        }
+        /*
+         * Show max_batch_time only when it differs from the default, and
+         * print the max value itself (not s_min_batch_time).
+         */
+        if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) {
+                seq_printf(seq, ",max_batch_time=%u",
+                           (unsigned) sbi->s_max_batch_time);
+        }
        /*
         * We're changing the default of barrier mount option, so
         * let's always display its mount state so it's clear what its
@@ -696,8 +829,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
                seq_puts(seq, ",journal_async_commit");
        if (test_opt(sb, NOBH))
                seq_puts(seq, ",nobh");
-        if (!test_opt(sb, EXTENTS))
-                seq_puts(seq, ",noextents");
        if (test_opt(sb, I_VERSION))
                seq_puts(seq, ",i_version");
        if (!test_opt(sb, DELALLOC))
@@ -772,6 +903,25 @@ static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
                                    ext4_nfs_get_inode);
 }
+/*
+ * Attempt to free the buffers of a block-device page that holds
+ * filesystem metadata (indirect blocks, directories).  Such buffers may
+ * carry journal heads that would stop try_to_free_buffers() from freeing
+ * them, so when the filesystem has a journal the release goes through the
+ * jbd2 layer's jbd2_journal_try_to_free_buffers(), with __GFP_WAIT masked
+ * out of @wait.  Without a journal, plain try_to_free_buffers() is used.
+ * Returns 0 for a page that has no buffers, otherwise the result of the
+ * helper that was called.
+ */
+static int bdev_try_to_free_page(struct super_block *sb, struct page *page, gfp_t wait)
+{
+        journal_t *jnl = EXT4_SB(sb)->s_journal;
+        WARN_ON(PageChecked(page));
+        if (page_has_buffers(page)) {
+                if (!jnl)
+                        return try_to_free_buffers(page);
+                return jbd2_journal_try_to_free_buffers(jnl, page,
+                                                        wait & ~__GFP_WAIT);
+        }
+        return 0;
+}
 #ifdef CONFIG_QUOTA
 #define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group")
 #define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
@@ -803,7 +953,9 @@ static struct dquot_operations ext4_quota_operations = {
        .acquire_dquot  = ext4_acquire_dquot,
        .release_dquot  = ext4_release_dquot,
        .mark_dirty     = ext4_mark_dquot_dirty,
-        .write_info     = ext4_write_info
+        .write_info     = ext4_write_info,
+        .alloc_dquot    = dquot_alloc,
+        .destroy_dquot  = dquot_destroy,
 };
 static struct quotactl_ops ext4_qctl_operations = {
@@ -826,8 +978,8 @@ static const struct super_operations ext4_sops = {
        .put_super      = ext4_put_super,
        .write_super    = ext4_write_super,
        .sync_fs        = ext4_sync_fs,
-        .write_super_lockfs = ext4_write_super_lockfs,
+        .freeze_fs      = ext4_freeze,
-        .unlockfs       = ext4_unlockfs,
+        .unfreeze_fs    = ext4_unfreeze,
        .statfs         = ext4_statfs,
        .remount_fs     = ext4_remount,
        .clear_inode    = ext4_clear_inode,
@@ -836,6 +988,7 @@ static const struct super_operations ext4_sops = {
        .quota_read     = ext4_quota_read,
        .quota_write    = ext4_quota_write,
 #endif
+        .bdev_try_to_free_page = bdev_try_to_free_page,
 };
 static const struct export_operations ext4_export_ops = {
@@ -850,16 +1003,17 @@ enum {
        Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov,
        Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
        Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
-        Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
+        Opt_commit, Opt_min_batch_time, Opt_max_batch_time,
+        Opt_journal_update, Opt_journal_dev,
        Opt_journal_checksum, Opt_journal_async_commit,
        Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
        Opt_data_err_abort, Opt_data_err_ignore,
        Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
        Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
        Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
-        Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
+        Opt_grpquota, Opt_i_version,
        Opt_stripe, Opt_delalloc, Opt_nodelalloc,
-        Opt_inode_readahead_blks
+        Opt_inode_readahead_blks, Opt_journal_ioprio
 };
 static const match_table_t tokens = {
@@ -889,8 +1043,9 @@ static const match_table_t tokens = {
        {Opt_nobh, "nobh"},
        {Opt_bh, "bh"},
        {Opt_commit, "commit=%u"},
+        {Opt_min_batch_time, "min_batch_time=%u"},
+        {Opt_max_batch_time, "max_batch_time=%u"},
        {Opt_journal_update, "journal=update"},
-        {Opt_journal_inum, "journal=%u"},
        {Opt_journal_dev, "journal_dev=%u"},
        {Opt_journal_checksum, "journal_checksum"},
        {Opt_journal_async_commit, "journal_async_commit"},
@@ -911,14 +1066,13 @@ static const match_table_t tokens = {
        {Opt_quota, "quota"},
        {Opt_usrquota, "usrquota"},
        {Opt_barrier, "barrier=%u"},
-        {Opt_extents, "extents"},
-        {Opt_noextents, "noextents"},
        {Opt_i_version, "i_version"},
        {Opt_stripe, "stripe=%u"},
        {Opt_resize, "resize"},
        {Opt_delalloc, "delalloc"},
        {Opt_nodelalloc, "nodelalloc"},
        {Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
+        {Opt_journal_ioprio, "journal_ioprio=%u"},
        {Opt_err, NULL},
 };
@@ -943,8 +1097,11 @@ static ext4_fsblk_t get_sb_block(void **data)
        return sb_block;
 }
+#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
 static int parse_options(char *options, struct super_block *sb,
-                         unsigned int *inum, unsigned long *journal_devnum,
+                         unsigned long *journal_devnum,
+                         unsigned int *journal_ioprio,
                         ext4_fsblk_t *n_blocks_count, int is_remount)
 {
        struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -956,7 +1113,6 @@ static int parse_options(char *options, struct super_block *sb,
        int qtype, qfmt;
        char *qname;
 #endif
-        ext4_fsblk_t last_block;
        if (!options)
                return 1;
@@ -1068,16 +1224,6 @@ static int parse_options(char *options, struct super_block *sb,
                        }
                        set_opt(sbi->s_mount_opt, UPDATE_JOURNAL);
                        break;
-                case Opt_journal_inum:
-                        if (is_remount) {
-                                printk(KERN_ERR "EXT4-fs: cannot specify "
-                                       "journal on remount\n");
-                                return 0;
-                        }
-                        if (match_int(&args[0], &option))
-                                return 0;
-                        *inum = option;
-                        break;
                case Opt_journal_dev:
                        if (is_remount) {
                                printk(KERN_ERR "EXT4-fs: cannot specify "
@@ -1107,6 +1253,22 @@ static int parse_options(char *options, struct super_block *sb,
                                option = JBD2_DEFAULT_MAX_COMMIT_AGE;
                        sbi->s_commit_interval = HZ * option;
                        break;
+                case Opt_max_batch_time:
+                        if (match_int(&args[0], &option))
+                                return 0;
+                        if (option < 0)
+                                return 0;
+                        if (option == 0)
+                                option = EXT4_DEF_MAX_BATCH_TIME;
+                        sbi->s_max_batch_time = option;
+                        break;
+                case Opt_min_batch_time:
+                        if (match_int(&args[0], &option))
+                                return 0;
+                        if (option < 0)
+                                return 0;
+                        sbi->s_min_batch_time = option;
+                        break;
                case Opt_data_journal:
                        data_opt = EXT4_MOUNT_JOURNAL_DATA;
                        goto datacheck;
@@ -1142,8 +1304,7 @@ static int parse_options(char *options, struct super_block *sb,
                case Opt_grpjquota:
                        qtype = GRPQUOTA;
 set_qf_name:
-                        if ((sb_any_quota_enabled(sb) ||
+                        if (sb_any_quota_loaded(sb) &&
-                             sb_any_quota_suspended(sb)) &&
                            !sbi->s_qf_names[qtype]) {
                                printk(KERN_ERR
                                       "EXT4-fs: Cannot change journaled "
@@ -1182,8 +1343,7 @@ set_qf_name:
                case Opt_offgrpjquota:
                        qtype = GRPQUOTA;
 clear_qf_name:
-                        if ((sb_any_quota_enabled(sb) ||
+                        if (sb_any_quota_loaded(sb) &&
-                             sb_any_quota_suspended(sb)) &&
                            sbi->s_qf_names[qtype]) {
                                printk(KERN_ERR "EXT4-fs: Cannot change "
                                        "journaled quota options when "
@@ -1202,8 +1362,7 @@ clear_qf_name:
                case Opt_jqfmt_vfsv0:
                        qfmt = QFMT_VFS_V0;
 set_qf_format:
-                        if ((sb_any_quota_enabled(sb) ||
+                        if (sb_any_quota_loaded(sb) &&
-                             sb_any_quota_suspended(sb)) &&
                            sbi->s_jquota_fmt != qfmt) {
                                printk(KERN_ERR "EXT4-fs: Cannot change "
                                        "journaled quota options when "
@@ -1222,7 +1381,7 @@ set_qf_format:
                        set_opt(sbi->s_mount_opt, GRPQUOTA);
                        break;
                case Opt_noquota:
-                        if (sb_any_quota_enabled(sb)) {
+                        if (sb_any_quota_loaded(sb)) {
                                printk(KERN_ERR "EXT4-fs: Cannot change quota "
                                        "options when quota turned on.\n");
                                return 0;
@@ -1280,33 +1439,6 @@ set_qf_format:
                case Opt_bh:
                        clear_opt(sbi->s_mount_opt, NOBH);
                        break;
-                case Opt_extents:
-                        if (!EXT4_HAS_INCOMPAT_FEATURE(sb,
-                                        EXT4_FEATURE_INCOMPAT_EXTENTS)) {
-                                ext4_warning(sb, __func__,
-                                        "extents feature not enabled "
-                                        "on this filesystem, use tune2fs\n");
-                                return 0;
-                        }
-                        set_opt(sbi->s_mount_opt, EXTENTS);
-                        break;
-                case Opt_noextents:
-                        /*
-                         * When e2fsprogs support resizing an already existing
-                         * ext3 file system to greater than 2**32 we need to
-                         * add support to block allocator to handle growing
-                         * already existing block  mapped inode so that blocks
-                         * allocated for them fall within 2**32
-                         */
-                        last_block = ext4_blocks_count(sbi->s_es) - 1;
-                        if (last_block  > 0xffffffffULL) {
-                                printk(KERN_ERR "EXT4-fs: Filesystem too "
-                                                "large to mount with "
-                                                "-o noextents options\n");
-                                return 0;
-                        }
-                        clear_opt(sbi->s_mount_opt, EXTENTS);
-                        break;
                case Opt_i_version:
                        set_opt(sbi->s_mount_opt, I_VERSION);
                        sb->s_flags |= MS_I_VERSION;
@@ -1331,6 +1463,14 @@ set_qf_format:
                                return 0;
                        sbi->s_inode_readahead_blks = option;
                        break;
+                case Opt_journal_ioprio:
+                        if (match_int(&args[0], &option))
+                                return 0;
+                        if (option < 0 || option > 7)
+                                break;
+                        *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE,
+                                                            option);
+                        break;
                default:
                        printk(KERN_ERR
                               "EXT4-fs: Unrecognized mount option \"%s\" "
@@ -1406,24 +1546,19 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
                printk(KERN_WARNING
                       "EXT4-fs warning: checktime reached, "
                       "running e2fsck is recommended\n");
-#if 0
+        if (!sbi->s_journal) 
-                /* @@@ We _will_ want to clear the valid bit if we find
+                es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
-                 * inconsistencies, to force a fsck at reboot.  But for
-                 * a plain journaled filesystem we can keep it set as
-                 * valid forever! :)
-                 */
-        es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
-#endif
        if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
                es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
        le16_add_cpu(&es->s_mnt_count, 1);
        es->s_mtime = cpu_to_le32(get_seconds());
        ext4_update_dynamic_rev(sb);
-        EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+        if (sbi->s_journal)
+                EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
        ext4_commit_super(sb, es, 1);
        if (test_opt(sb, DEBUG))
-                printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%lu, "
+                printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
                                "bpg=%lu, ipg=%lu, mo=%04lx]\n",
                        sb->s_blocksize,
                        sbi->s_groups_count,
@@ -1431,9 +1566,13 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
                        EXT4_INODES_PER_GROUP(sb),
                        sbi->s_mount_opt);
-        printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n",
+        if (EXT4_SB(sb)->s_journal) {
-               sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" :
+                printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n",
-               "external", EXT4_SB(sb)->s_journal->j_devname);
+                       sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" :
+                       "external", EXT4_SB(sb)->s_journal->j_devname);
+        } else {
+                printk(KERN_INFO "EXT4 FS on %s, no journal\n", sb->s_id);
+        }
        return res;
 }
@@ -1445,7 +1584,6 @@ static int ext4_fill_flex_info(struct super_block *sb)
        ext4_group_t flex_group_count;
        ext4_group_t flex_group;
        int groups_per_flex = 0;
-        __u64 block_bitmap = 0;
        int i;
        if (!sbi->s_es->s_log_groups_per_flex) {
@@ -1464,21 +1602,18 @@ static int ext4_fill_flex_info(struct super_block *sb)
                                     sizeof(struct flex_groups), GFP_KERNEL);
        if (sbi->s_flex_groups == NULL) {
                printk(KERN_ERR "EXT4-fs: not enough memory for "
-                                "%lu flex groups\n", flex_group_count);
+                                "%u flex groups\n", flex_group_count);
                goto failed;
        }
-        gdp = ext4_get_group_desc(sb, 1, &bh);
-        block_bitmap = ext4_block_bitmap(sb, gdp) - 1;
        for (i = 0; i < sbi->s_groups_count; i++) {
                gdp = ext4_get_group_desc(sb, i, &bh);
                flex_group = ext4_flex_group(sbi, i);
                sbi->s_flex_groups[flex_group].free_inodes +=
-                        le16_to_cpu(gdp->bg_free_inodes_count);
+                        ext4_free_inodes_count(sb, gdp);
                sbi->s_flex_groups[flex_group].free_blocks +=
-                        le16_to_cpu(gdp->bg_free_blocks_count);
+                        ext4_free_blks_count(sb, gdp);
        }
        return 1;
@@ -1552,14 +1687,14 @@ static int ext4_check_descriptors(struct super_block *sb)
                block_bitmap = ext4_block_bitmap(sb, gdp);
                if (block_bitmap < first_block || block_bitmap > last_block) {
                        printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
-                               "Block bitmap for group %lu not in group "
+                               "Block bitmap for group %u not in group "
                               "(block %llu)!\n", i, block_bitmap);
                        return 0;
                }
                inode_bitmap = ext4_inode_bitmap(sb, gdp);
                if (inode_bitmap < first_block || inode_bitmap > last_block) {
                        printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
-                               "Inode bitmap for group %lu not in group "
+                               "Inode bitmap for group %u not in group "
                               "(block %llu)!\n", i, inode_bitmap);
                        return 0;
                }
@@ -1567,14 +1702,14 @@ static int ext4_check_descriptors(struct super_block *sb)
                if (inode_table < first_block ||
                    inode_table + sbi->s_itb_per_group - 1 > last_block) {
                        printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
-                               "Inode table for group %lu not in group "
+                               "Inode table for group %u not in group "
                               "(block %llu)!\n", i, inode_table);
                        return 0;
                }
                spin_lock(sb_bgl_lock(sbi, i));
                if (!ext4_group_desc_csum_verify(sbi, i, gdp)) {
                        printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
-                               "Checksum for group %lu failed (%u!=%u)\n",
+                               "Checksum for group %u failed (%u!=%u)\n",
                               i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
                               gdp)), le16_to_cpu(gdp->bg_checksum));
                        if (!(sb->s_flags & MS_RDONLY)) {
@@ -1866,19 +2001,20 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        ext4_fsblk_t sb_block = get_sb_block(&data);
        ext4_fsblk_t logical_sb_block;
        unsigned long offset = 0;
-        unsigned int journal_inum = 0;
        unsigned long journal_devnum = 0;
        unsigned long def_mount_opts;
        struct inode *root;
        char *cp;
+        const char *descr;
        int ret = -EINVAL;
        int blocksize;
-        int db_count;
+        unsigned int db_count;
-        int i;
+        unsigned int i;
        int needs_recovery, has_huge_files;
-        __le32 features;
+        int features;
        __u64 blocks_count;
        int err;
+        unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
        sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
        if (!sbi)
@@ -1959,31 +2095,22 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
        sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
+        sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
+        sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
+        sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
        set_opt(sbi->s_mount_opt, RESERVATION);
        set_opt(sbi->s_mount_opt, BARRIER);
        /*
-         * turn on extents feature by default in ext4 filesystem
-         * only if feature flag already set by mkfs or tune2fs.
-         * Use -o noextents to turn it off
-         */
-        if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
-                set_opt(sbi->s_mount_opt, EXTENTS);
-        else
-                ext4_warning(sb, __func__,
-                        "extents feature not enabled on this filesystem, "
-                        "use tune2fs.\n");
-        /*
         * enable delayed allocation by default
         * Use -o nodelalloc to turn it off
         */
        set_opt(sbi->s_mount_opt, DELALLOC);
-        if (!parse_options((char *) data, sb, &journal_inum, &journal_devnum,
+        if (!parse_options((char *) data, sb, &journal_devnum,
-                           NULL, 0))
+                           &journal_ioprio, NULL, 0))
                goto failed_mount;
        sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
@@ -2005,15 +2132,17 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP);
        if (features) {
                printk(KERN_ERR "EXT4-fs: %s: couldn't mount because of "
-                       "unsupported optional features (%x).\n",
+                       "unsupported optional features (%x).\n", sb->s_id,
-                       sb->s_id, le32_to_cpu(features));
+                        (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
+                        ~EXT4_FEATURE_INCOMPAT_SUPP));
                goto failed_mount;
        }
        features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP);
        if (!(sb->s_flags & MS_RDONLY) && features) {
                printk(KERN_ERR "EXT4-fs: %s: couldn't mount RDWR because of "
-                       "unsupported optional features (%x).\n",
+                       "unsupported optional features (%x).\n", sb->s_id,
-                       sb->s_id, le32_to_cpu(features));
+                        (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
+                        ~EXT4_FEATURE_RO_COMPAT_SUPP));
                goto failed_mount;
        }
        has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
@@ -2118,6 +2247,18 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        for (i = 0; i < 4; i++)
                sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
        sbi->s_def_hash_version = es->s_def_hash_version;
+        i = le32_to_cpu(es->s_flags);
+        if (i & EXT2_FLAGS_UNSIGNED_HASH)
+                sbi->s_hash_unsigned = 3;
+        else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
+#ifdef __CHAR_UNSIGNED__
+                es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
+                sbi->s_hash_unsigned = 3;
+#else
+                es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
+#endif
+                sb->s_dirt = 1;
+        }
        if (sbi->s_blocks_per_group > blocksize * 8) {
                printk(KERN_ERR
@@ -2145,20 +2286,30 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
                goto cantfind_ext4;
-        /* ensure blocks_count calculation below doesn't sign-extend */
+        /*
-        if (ext4_blocks_count(es) + EXT4_BLOCKS_PER_GROUP(sb) <
+         * It makes no sense for the first data block to be beyond the end
-            le32_to_cpu(es->s_first_data_block) + 1) {
+         * of the filesystem.
-                printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu, "
+         */
-                       "first data block %u, blocks per group %lu\n",
+        if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
-                        ext4_blocks_count(es),
+                printk(KERN_WARNING "EXT4-fs: bad geometry: first data"
-                        le32_to_cpu(es->s_first_data_block),
+                       "block %u is beyond end of filesystem (%llu)\n",
-                        EXT4_BLOCKS_PER_GROUP(sb));
+                       le32_to_cpu(es->s_first_data_block),
+                       ext4_blocks_count(es));
                goto failed_mount;
        }
        blocks_count = (ext4_blocks_count(es) -
                        le32_to_cpu(es->s_first_data_block) +
                        EXT4_BLOCKS_PER_GROUP(sb) - 1);
        do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
+        if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
+                printk(KERN_WARNING "EXT4-fs: groups count too large: %u "
+                       "(block count %llu, first data block %u, "
+                       "blocks per group %lu)\n", sbi->s_groups_count,
+                       ext4_blocks_count(es),
+                       le32_to_cpu(es->s_first_data_block),
+                       EXT4_BLOCKS_PER_GROUP(sb));
+                goto failed_mount;
+        }
        sbi->s_groups_count = blocks_count;
        db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
                   EXT4_DESC_PER_BLOCK(sb);
@@ -2270,27 +2421,26 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                                EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
                                es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
                                ext4_commit_super(sb, es, 1);
-                                printk(KERN_CRIT
-                                       "EXT4-fs (device %s): mount failed\n",
-                                      sb->s_id);
                                goto failed_mount4;
                        }
                }
-        } else if (journal_inum) {
+        } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) &&
-                if (ext4_create_journal(sb, es, journal_inum))
+              EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
-                        goto failed_mount3;
+                printk(KERN_ERR "EXT4-fs: required journal recovery "
+                       "suppressed and not mounted read-only\n");
+                goto failed_mount4;
        } else {
-                if (!silent)
+                clear_opt(sbi->s_mount_opt, DATA_FLAGS);
-                        printk(KERN_ERR
+                set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
-                               "ext4: No journal on filesystem on %s\n",
+                sbi->s_journal = NULL;
-                               sb->s_id);
+                needs_recovery = 0;
-                goto failed_mount3;
+                goto no_journal;
        }
        if (ext4_blocks_count(es) > 0xffffffffULL &&
            !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
                                       JBD2_FEATURE_INCOMPAT_64BIT)) {
-                printk(KERN_ERR "ext4: Failed to set 64-bit journal feature\n");
+                printk(KERN_ERR "EXT4-fs: Failed to set 64-bit journal feature\n");
                goto failed_mount4;
        }
@@ -2335,6 +2485,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        default:
                break;
        }
+        set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
+no_journal:
        if (test_opt(sb, NOBH)) {
                if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
@@ -2420,13 +2573,22 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
        ext4_orphan_cleanup(sb, es);
        EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
-        if (needs_recovery)
+        if (needs_recovery) {
                printk(KERN_INFO "EXT4-fs: recovery complete.\n");
-        ext4_mark_recovery_complete(sb, es);
+                ext4_mark_recovery_complete(sb, es);
-        printk(KERN_INFO "EXT4-fs: mounted filesystem with %s data mode.\n",
+        }
-               test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ? "journal":
+        if (EXT4_SB(sb)->s_journal) {
-               test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
+                if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
-               "writeback");
+                        descr = " journalled data mode";
+                else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
+                        descr = " ordered data mode";
+                else
+                        descr = " writeback data mode";
+        } else
+                descr = "out journal";
+        printk(KERN_INFO "EXT4-fs: mounted filesystem %s with%s\n",
+               sb->s_id, descr);
        lock_kernel();
        return 0;
@@ -2438,8 +2600,11 @@ cantfind_ext4:
        goto failed_mount;
 failed_mount4:
-        jbd2_journal_destroy(sbi->s_journal);
+        printk(KERN_ERR "EXT4-fs (device %s): mount failed\n", sb->s_id);
-        sbi->s_journal = NULL;
+        if (sbi->s_journal) {
+                jbd2_journal_destroy(sbi->s_journal);
+                sbi->s_journal = NULL;
+        }
 failed_mount3:
        percpu_counter_destroy(&sbi->s_freeblocks_counter);
        percpu_counter_destroy(&sbi->s_freeinodes_counter);
@@ -2476,11 +2641,9 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
 {
        struct ext4_sb_info *sbi = EXT4_SB(sb);
-        if (sbi->s_commit_interval)
+        journal->j_commit_interval = sbi->s_commit_interval;
-                journal->j_commit_interval = sbi->s_commit_interval;
+        journal->j_min_batch_time = sbi->s_min_batch_time;
-        /* We could also set up an ext4-specific default for the commit
+        journal->j_max_batch_time = sbi->s_max_batch_time;
-         * interval here, but for now we'll just fall back to the jbd
-         * default. */
        spin_lock(&journal->j_state_lock);
        if (test_opt(sb, BARRIER))
@@ -2500,6 +2663,8 @@ static journal_t *ext4_get_journal(struct super_block *sb,
        struct inode *journal_inode;
        journal_t *journal;
+        BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
        /* First, test for the existence of a valid inode on disk.  Bad
         * things happen if we iget() an unused inode, as the subsequent
         * iput() will try to delete it. */
@@ -2548,13 +2713,15 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
        struct ext4_super_block *es;
        struct block_device *bdev;
+        BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
        bdev = ext4_blkdev_get(j_dev);
        if (bdev == NULL)
                return NULL;
        if (bd_claim(bdev, sb)) {
                printk(KERN_ERR
-                        "EXT4: failed to claim external journal device.\n");
+                        "EXT4-fs: failed to claim external journal device.\n");
                blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
                return NULL;
        }
@@ -2635,6 +2802,8 @@ static int ext4_load_journal(struct super_block *sb,
        int err = 0;
        int really_read_only;
+        BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
        if (journal_devnum &&
            journal_devnum != le32_to_cpu(es->s_journal_dev)) {
                printk(KERN_INFO "EXT4-fs: external journal device major/minor "
@@ -2719,55 +2888,14 @@ static int ext4_load_journal(struct super_block *sb,
        return 0;
 }
-static int ext4_create_journal(struct super_block *sb,
+static int ext4_commit_super(struct super_block *sb,
-                               struct ext4_super_block *es,
-                               unsigned int journal_inum)
-{
-        journal_t *journal;
-        int err;
-        if (sb->s_flags & MS_RDONLY) {
-                printk(KERN_ERR "EXT4-fs: readonly filesystem when trying to "
-                                "create journal.\n");
-                return -EROFS;
-        }
-        journal = ext4_get_journal(sb, journal_inum);
-        if (!journal)
-                return -EINVAL;
-        printk(KERN_INFO "EXT4-fs: creating new journal on inode %u\n",
-               journal_inum);
-        err = jbd2_journal_create(journal);
-        if (err) {
-                printk(KERN_ERR "EXT4-fs: error creating journal.\n");
-                jbd2_journal_destroy(journal);
-                return -EIO;
-        }
-        EXT4_SB(sb)->s_journal = journal;
-        ext4_update_dynamic_rev(sb);
-        EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
-        EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL);
-        es->s_journal_inum = cpu_to_le32(journal_inum);
-        sb->s_dirt = 1;
-        /* Make sure we flush the recovery flag to disk. */
-        ext4_commit_super(sb, es, 1);
-        return 0;
-}
-static void ext4_commit_super(struct super_block *sb,
                              struct ext4_super_block *es, int sync)
 {
        struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
+        int error = 0;
        if (!sbh)
-                return;
+                return error;
        if (buffer_write_io_error(sbh)) {
                /*
                 * Oh, dear.  A previous attempt to write the
@@ -2777,25 +2905,33 @@ static void ext4_commit_super(struct super_block *sb,
                 * be remapped.  Nothing we can do but to retry the
                 * write and hope for the best.
                 */
-                printk(KERN_ERR "ext4: previous I/O error to "
+                printk(KERN_ERR "EXT4-fs: previous I/O error to "
                       "superblock detected for %s.\n", sb->s_id);
                clear_buffer_write_io_error(sbh);
                set_buffer_uptodate(sbh);
        }
        es->s_wtime = cpu_to_le32(get_seconds());
-        ext4_free_blocks_count_set(es, ext4_count_free_blocks(sb));
+        ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
-        es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb));
+                                        &EXT4_SB(sb)->s_freeblocks_counter));
+        es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive(
+                                        &EXT4_SB(sb)->s_freeinodes_counter));
        BUFFER_TRACE(sbh, "marking dirty");
        mark_buffer_dirty(sbh);
        if (sync) {
-                sync_dirty_buffer(sbh);
+                error = sync_dirty_buffer(sbh);
-                if (buffer_write_io_error(sbh)) {
+                if (error)
-                        printk(KERN_ERR "ext4: I/O error while writing "
+                        return error;
+                error = buffer_write_io_error(sbh);
+                if (error) {
+                        printk(KERN_ERR "EXT4-fs: I/O error while writing "
                               "superblock for %s.\n", sb->s_id);
                        clear_buffer_write_io_error(sbh);
                        set_buffer_uptodate(sbh);
                }
        }
+        return error;
 }
@@ -2809,6 +2945,10 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
 {
        journal_t *journal = EXT4_SB(sb)->s_journal;
+        if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
+                BUG_ON(journal != NULL);
+                return;
+        }
        jbd2_journal_lock_updates(journal);
        if (jbd2_journal_flush(journal) < 0)
                goto out;
@@ -2838,6 +2978,8 @@ static void ext4_clear_journal_err(struct super_block *sb,
        int j_errno;
        const char *errstr;
+        BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
        journal = EXT4_SB(sb)->s_journal;
        /*
@@ -2870,14 +3012,17 @@ static void ext4_clear_journal_err(struct super_block *sb,
 int ext4_force_commit(struct super_block *sb)
 {
        journal_t *journal;
-        int ret;
+        int ret = 0;
        if (sb->s_flags & MS_RDONLY)
                return 0;
        journal = EXT4_SB(sb)->s_journal;
-        sb->s_dirt = 0;
+        if (journal) {
-        ret = ext4_journal_force_commit(journal);
+                sb->s_dirt = 0;
+                ret = ext4_journal_force_commit(journal);
+        }
        return ret;
 }
@@ -2889,9 +3034,13 @@ int ext4_force_commit(struct super_block *sb)
 */
 static void ext4_write_super(struct super_block *sb)
 {
-        if (mutex_trylock(&sb->s_lock) != 0)
+        if (EXT4_SB(sb)->s_journal) {
-                BUG();
+                if (mutex_trylock(&sb->s_lock) != 0)
-        sb->s_dirt = 0;
+                        BUG();
+                sb->s_dirt = 0;
+        } else {
+                ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
+        }
 }
 static int ext4_sync_fs(struct super_block *sb, int wait)
@@ -2900,10 +3049,14 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
        trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait);
        sb->s_dirt = 0;
-        if (wait)
+        if (EXT4_SB(sb)->s_journal) {
-                ret = ext4_force_commit(sb);
+                if (wait)
-        else
+                        ret = ext4_force_commit(sb);
-                jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, NULL);
+                else
+                        jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, NULL);
+        } else {
+                ext4_commit_super(sb, EXT4_SB(sb)->s_es, wait);
+        }
        return ret;
 }
@@ -2911,36 +3064,48 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
 * LVM calls this function before a (read-only) snapshot is created.  This
 * gives us a chance to flush the journal completely and mark the fs clean.
 */
-static void ext4_write_super_lockfs(struct super_block *sb)
+static int ext4_freeze(struct super_block *sb)
 {
+        int error = 0;
+        journal_t *journal;
        sb->s_dirt = 0;
        if (!(sb->s_flags & MS_RDONLY)) {
-                journal_t *journal = EXT4_SB(sb)->s_journal;
+                journal = EXT4_SB(sb)->s_journal;
-                /* Now we set up the journal barrier. */
+                if (journal) {
-                jbd2_journal_lock_updates(journal);
+                        /* Now we set up the journal barrier. */
+                        jbd2_journal_lock_updates(journal);
-                /*
+                        /*
-                 * We don't want to clear needs_recovery flag when we failed
+                         * We don't want to clear needs_recovery flag when we
-                 * to flush the journal.
+                         * failed to flush the journal.
-                 */
+                         */
-                if (jbd2_journal_flush(journal) < 0)
+                        error = jbd2_journal_flush(journal);
-                        return;
+                        if (error < 0)
+                                goto out;
+                }
                /* Journal blocked and flushed, clear needs_recovery flag. */
                EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
                ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
+                error = ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
+                if (error)
+                        goto out;
        }
+        return 0;
+out:
+        jbd2_journal_unlock_updates(journal);
+        return error;
 }
 /*
 * Called by LVM after the snapshot is done.  We need to reset the RECOVER
 * flag here, even though the filesystem is not technically dirty yet.
 */
-static void ext4_unlockfs(struct super_block *sb)
+static int ext4_unfreeze(struct super_block *sb)
 {
-        if (!(sb->s_flags & MS_RDONLY)) {
+        if (EXT4_SB(sb)->s_journal && !(sb->s_flags & MS_RDONLY)) {
                lock_super(sb);
                /* Reser the needs_recovery flag before the fs is unlocked. */
                EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
@@ -2948,6 +3113,7 @@ static void ext4_unlockfs(struct super_block *sb)
                unlock_super(sb);
                jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
        }
+        return 0;
 }
 static int ext4_remount(struct super_block *sb, int *flags, char *data)
@@ -2958,6 +3124,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
        unsigned long old_sb_flags;
        struct ext4_mount_options old_opts;
        ext4_group_t g;
+        unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
        int err;
 #ifdef CONFIG_QUOTA
        int i;
@@ -2969,16 +3136,21 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
        old_opts.s_resuid = sbi->s_resuid;
        old_opts.s_resgid = sbi->s_resgid;
        old_opts.s_commit_interval = sbi->s_commit_interval;
+        old_opts.s_min_batch_time = sbi->s_min_batch_time;
+        old_opts.s_max_batch_time = sbi->s_max_batch_time;
 #ifdef CONFIG_QUOTA
        old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
        for (i = 0; i < MAXQUOTAS; i++)
                old_opts.s_qf_names[i] = sbi->s_qf_names[i];
 #endif
+        if (sbi->s_journal && sbi->s_journal->j_task->io_context)
+                journal_ioprio = sbi->s_journal->j_task->io_context->ioprio;
        /*
         * Allow the "check" option to be passed as a remount option.
         */
-        if (!parse_options(data, sb, NULL, NULL, &n_blocks_count, 1)) {
+        if (!parse_options(data, sb, NULL, &journal_ioprio,
+                           &n_blocks_count, 1)) {
                err = -EINVAL;
                goto restore_opts;
        }
@@ -2991,7 +3163,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
        es = sbi->s_es;
-        ext4_init_journal_params(sb, sbi->s_journal);
+        if (sbi->s_journal) {
+                ext4_init_journal_params(sb, sbi->s_journal);
+                set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
+        }
        if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) ||
                n_blocks_count > ext4_blocks_count(es)) {
@@ -3020,17 +3195,20 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
                         * We have to unlock super so that we can wait for
                         * transactions.
                         */
-                        unlock_super(sb);
+                        if (sbi->s_journal) {
-                        ext4_mark_recovery_complete(sb, es);
+                                unlock_super(sb);
-                        lock_super(sb);
+                                ext4_mark_recovery_complete(sb, es);
+                                lock_super(sb);
+                        }
                } else {
-                        __le32 ret;
+                        int ret;
                        if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb,
                                        ~EXT4_FEATURE_RO_COMPAT_SUPP))) {
                                printk(KERN_WARNING "EXT4-fs: %s: couldn't "
                                       "remount RDWR because of unsupported "
-                                       "optional features (%x).\n",
+                                       "optional features (%x).\n", sb->s_id,
-                                       sb->s_id, le32_to_cpu(ret));
+                                (le32_to_cpu(sbi->s_es->s_feature_ro_compat) &
+                                        ~EXT4_FEATURE_RO_COMPAT_SUPP));
                                err = -EROFS;
                                goto restore_opts;
                        }
@@ -3047,7 +3225,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
                                if (!ext4_group_desc_csum_verify(sbi, g, gdp)) {
                                        printk(KERN_ERR
               "EXT4-fs: ext4_remount: "
-                "Checksum for group %lu failed (%u!=%u)\n",
+                "Checksum for group %u failed (%u!=%u)\n",
                g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)),
                                               le16_to_cpu(gdp->bg_checksum));
                                        err = -EINVAL;
@@ -3076,7 +3254,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
                         * been changed by e2fsck since we originally mounted
                         * the partition.)
                         */
-                        ext4_clear_journal_err(sb, es);
+                        if (sbi->s_journal)
+                                ext4_clear_journal_err(sb, es);
                        sbi->s_mount_state = le16_to_cpu(es->s_state);
                        if ((err = ext4_group_extend(sb, es, n_blocks_count)))
                                goto restore_opts;
@@ -3084,6 +3263,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
                                sb->s_flags &= ~MS_RDONLY;
                }
        }
+        if (sbi->s_journal == NULL)
+                ext4_commit_super(sb, es, 1);
 #ifdef CONFIG_QUOTA
        /* Release old quota file names */
        for (i = 0; i < MAXQUOTAS; i++)
@@ -3098,6 +3280,8 @@ restore_opts:
        sbi->s_resuid = old_opts.s_resuid;
        sbi->s_resgid = old_opts.s_resgid;
        sbi->s_commit_interval = old_opts.s_commit_interval;
+        sbi->s_min_batch_time = old_opts.s_min_batch_time;
+        sbi->s_max_batch_time = old_opts.s_max_batch_time;
 #ifdef CONFIG_QUOTA
        sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
        for (i = 0; i < MAXQUOTAS; i++) {
@@ -3360,7 +3544,8 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
         * When we journal data on quota file, we have to flush journal to see
         * all updates to the file when we bypass pagecache...
         */
-        if (ext4_should_journal_data(path.dentry->d_inode)) {
+        if (EXT4_SB(sb)->s_journal &&
+            ext4_should_journal_data(path.dentry->d_inode)) {
                /*
                 * We don't need to lock updates but journal_flush() could
                 * otherwise be livelocked...
@@ -3434,7 +3619,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
        struct buffer_head *bh;
        handle_t *handle = journal_current_handle();
-        if (!handle) {
+        if (EXT4_SB(sb)->s_journal && !handle) {
                printk(KERN_WARNING "EXT4-fs: Quota write (off=%llu, len=%llu)"
                        " cancelled because transaction is not started.\n",
                        (unsigned long long)off, (unsigned long long)len);
@@ -3459,7 +3644,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
                flush_dcache_page(bh->b_page);
                unlock_buffer(bh);
                if (journal_quota)
-                        err = ext4_journal_dirty_metadata(handle, bh);
+                        err = ext4_handle_dirty_metadata(handle, NULL, bh);
                else {
                        /* Always do at least ordered writes for quotas */
                        err = ext4_jbd2_file_inode(handle, inode);
@@ -3513,18 +3698,15 @@ static int ext4_ui_proc_open(struct inode *inode, struct file *file)
 static ssize_t ext4_ui_proc_write(struct file *file, const char __user *buf,
                               size_t cnt, loff_t *ppos)
 {
-        unsigned int *p = PDE(file->f_path.dentry->d_inode)->data;
+        unsigned long *p = PDE(file->f_path.dentry->d_inode)->data;
        char str[32];
-        unsigned long value;
        if (cnt >= sizeof(str))
                return -EINVAL;
        if (copy_from_user(str, buf, cnt))
                return -EFAULT;
-        value = simple_strtol(str, NULL, 0);
-        if (value < 0)
+        *p = simple_strtoul(str, NULL, 0);
-                return -ERANGE;
-        *p = value;
        return cnt;
 }
@@ -3615,7 +3797,7 @@ static void __exit exit_ext4_fs(void)
 }
 MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
-MODULE_DESCRIPTION("Fourth Extended Filesystem with extents");
+MODULE_DESCRIPTION("Fourth Extended Filesystem");
 MODULE_LICENSE("GPL");
 module_init(init_ext4_fs)
 module_exit(exit_ext4_fs)
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 80626d516fee..157ce6589c54 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -457,7 +457,7 @@ static void ext4_xattr_update_super_block(handle_t *handle,
        if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) {
                EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR);
                sb->s_dirt = 1;
-                ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
+                ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
        }
 }
@@ -487,9 +487,9 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
                ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
        } else {
                le32_add_cpu(&BHDR(bh)->h_refcount, -1);
-                error = ext4_journal_dirty_metadata(handle, bh);
+                error = ext4_handle_dirty_metadata(handle, inode, bh);
                if (IS_SYNC(inode))
-                        handle->h_sync = 1;
+                        ext4_handle_sync(handle);
                DQUOT_FREE_BLOCK(inode, 1);
                ea_bdebug(bh, "refcount now=%d; releasing",
                          le32_to_cpu(BHDR(bh)->h_refcount));
@@ -724,8 +724,9 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
                        if (error == -EIO)
                                goto bad_block;
                        if (!error)
-                                error = ext4_journal_dirty_metadata(handle,
+                                error = ext4_handle_dirty_metadata(handle,
-                                                                    bs->bh);
+                                                                   inode,
+                                                                   bs->bh);
                        if (error)
                                goto cleanup;
                        goto inserted;
@@ -794,8 +795,9 @@ inserted:
                                ea_bdebug(new_bh, "reusing; refcount now=%d",
                                        le32_to_cpu(BHDR(new_bh)->h_refcount));
                                unlock_buffer(new_bh);
-                                error = ext4_journal_dirty_metadata(handle,
+                                error = ext4_handle_dirty_metadata(handle,
-                                                                    new_bh);
+                                                                   inode,
+                                                                   new_bh);
                                if (error)
                                        goto cleanup_dquot;
                        }
@@ -810,8 +812,8 @@ inserted:
                        /* We need to allocate a new block */
                        ext4_fsblk_t goal = ext4_group_first_block_no(sb,
                                                EXT4_I(inode)->i_block_group);
-                        ext4_fsblk_t block = ext4_new_meta_block(handle, inode,
+                        ext4_fsblk_t block = ext4_new_meta_blocks(handle, inode,
-                                                        goal, &error);
+                                                  goal, NULL, &error);
                        if (error)
                                goto cleanup;
                        ea_idebug(inode, "creating block %d", block);
@@ -833,7 +835,8 @@ getblk_failed:
                        set_buffer_uptodate(new_bh);
                        unlock_buffer(new_bh);
                        ext4_xattr_cache_insert(new_bh);
-                        error = ext4_journal_dirty_metadata(handle, new_bh);
+                        error = ext4_handle_dirty_metadata(handle,
+                                                           inode, new_bh);
                        if (error)
                                goto cleanup;
                }
@@ -1040,7 +1043,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
                 */
                is.iloc.bh = NULL;
                if (IS_SYNC(inode))
-                        handle->h_sync = 1;
+                        ext4_handle_sync(handle);
        }
 cleanup:
diff --git a/fs/fat/Kconfig b/fs/fat/Kconfig
new file mode 100644
index 000000000000..d0a69ff25375
--- /dev/null
+++ b/fs/fat/Kconfig
@@ -0,0 +1,97 @@
+config FAT_FS
+        tristate
+        select NLS
+        help
+          If you want to use one of the FAT-based file systems (the MS-DOS and
+          VFAT (Windows 95) file systems), then you must say Y or M here
+          to include FAT support. You will then be able to mount partitions or
+          diskettes with FAT-based file systems and transparently access the
+          files on them, i.e. MSDOS files will look and behave just like all
+          other Unix files.
+          This FAT support is not a file system in itself, it only provides
+          the foundation for the other file systems. You will have to say Y or
+          M to at least one of "MSDOS fs support" or "VFAT fs support" in
+          order to make use of it.
+          Another way to read and write MSDOS floppies and hard drive
+          partitions from within Linux (but not transparently) is with the
+          mtools ("man mtools") program suite. You don't need to say Y here in
+          order to do that.
+          If you need to move large files on floppies between a DOS and a
+          Linux box, say Y here, mount the floppy under Linux with an MSDOS
+          file system and use GNU tar's M option. GNU tar is a program
+          available for Unix and DOS ("man tar" or "info tar").
+          The FAT support will enlarge your kernel by about 37 KB. If unsure,
+          say Y.
+          To compile this as a module, choose M here: the module will be called
+          fat.  Note that if you compile the FAT support as a module, you
+          cannot compile any of the FAT-based file systems into the kernel
+          -- they will have to be modules as well.
+config MSDOS_FS
+        tristate "MSDOS fs support"
+        select FAT_FS
+        help
+          This allows you to mount MSDOS partitions of your hard drive (unless
+          they are compressed; to access compressed MSDOS partitions under
+          Linux, you can either use the DOS emulator DOSEMU, described in the
+          DOSEMU-HOWTO, available from
+          <http://www.tldp.org/docs.html#howto>, or try dmsdosfs in
+          <ftp://ibiblio.org/pub/Linux/system/filesystems/dosfs/>. If you
+          intend to use dosemu with a non-compressed MSDOS partition, say Y
+          here) and MSDOS floppies. This means that file access becomes
+          transparent, i.e. the MSDOS files look and behave just like all
+          other Unix files.
+          If you have Windows 95 or Windows NT installed on your MSDOS
+          partitions, you should use the VFAT file system (say Y to "VFAT fs
+          support" below), or you will not be able to see the long filenames
+          generated by Windows 95 / Windows NT.
+          This option will enlarge your kernel by about 7 KB. If unsure,
+          answer Y. This will only work if you said Y to "DOS FAT fs support"
+          as well. To compile this as a module, choose M here: the module will
+          be called msdos.
+config VFAT_FS
+        tristate "VFAT (Windows-95) fs support"
+        select FAT_FS
+        help
+          This option provides support for normal Windows file systems with
+          long filenames.  That includes non-compressed FAT-based file systems
+          used by Windows 95, Windows 98, Windows NT 4.0, and the Unix
+          programs from the mtools package.
+          The VFAT support enlarges your kernel by about 10 KB and it only
+          works if you said Y to the "DOS FAT fs support" above.  Please read
+          the file <file:Documentation/filesystems/vfat.txt> for details.  If
+          unsure, say Y.
+          To compile this as a module, choose M here: the module will be called
+          vfat.
+config FAT_DEFAULT_CODEPAGE
+        int "Default codepage for FAT"
+        depends on MSDOS_FS || VFAT_FS
+        default 437
+        help
+          This option should be set to the codepage of your FAT filesystems.
+          It can be overridden with the "codepage" mount option.
+          See <file:Documentation/filesystems/vfat.txt> for more information.
+config FAT_DEFAULT_IOCHARSET
+        string "Default iocharset for FAT"
+        depends on VFAT_FS
+        default "iso8859-1"
+        help
+          Set this to the default input/output character set you'd
+          like FAT to use. It should probably match the character set
+          that most of your FAT filesystems use, and can be overridden
+          with the "iocharset" mount option for FAT filesystems.
+          Note that "utf8" is not recommended for FAT filesystems.
+          If unsure, you shouldn't set "utf8" here.
+          See <file:Documentation/filesystems/vfat.txt> for more information.
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 67e058357098..3a7f603b6982 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -841,7 +841,6 @@ const struct file_operations fat_dir_operations = {
        .compat_ioctl   = fat_compat_dir_ioctl,
 #endif
        .fsync          = file_fsync,
-        .llseek         = generic_file_llseek,
 };
 static int fat_get_short_entry(struct inode *dir, loff_t *pos,
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index d937aaf77374..6b74d09adbe5 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -749,6 +749,8 @@ static struct dentry *fat_get_parent(struct dentry *child)
        brelse(bh);
        parent = d_obtain_alias(inode);
+        if (!IS_ERR(parent))
+                parent->d_op = sb->s_root->d_op;
 out:
        unlock_super(sb);
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index bf326d4356a3..8ae32e37673c 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -78,7 +78,7 @@ static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd)
         * for creation.
         */
        if (!(nd->flags & (LOOKUP_CONTINUE | LOOKUP_PARENT))) {
-                if (nd->flags & LOOKUP_CREATE)
+                if (nd->flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET))
                        return 0;
        }
diff --git a/fs/fcntl.c b/fs/fcntl.c
index cdc141946724..bd215cc791da 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -50,7 +50,7 @@ static int get_close_on_exec(unsigned int fd)
        return res;
 }
-asmlinkage long sys_dup3(unsigned int oldfd, unsigned int newfd, int flags)
+SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags)
 {
        int err = -EBADF;
        struct file * file, *tofree;
@@ -113,7 +113,7 @@ out_unlock:
        return err;
 }
-asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd)
+SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
 {
        if (unlikely(newfd == oldfd)) { /* corner case */
                struct files_struct *files = current->files;
@@ -126,7 +126,7 @@ asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd)
        return sys_dup3(oldfd, newfd, 0);
 }
-asmlinkage long sys_dup(unsigned int fildes)
+SYSCALL_DEFINE1(dup, unsigned int, fildes)
 {
        int ret = -EBADF;
        struct file *file = fget(fildes);
@@ -335,7 +335,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
        return err;
 }
-asmlinkage long sys_fcntl(unsigned int fd, unsigned int cmd, unsigned long arg)
+SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
 {       
        struct file *filp;
        long err = -EBADF;
@@ -358,7 +358,8 @@ out:
 }
 #if BITS_PER_LONG == 32
-asmlinkage long sys_fcntl64(unsigned int fd, unsigned int cmd, unsigned long arg)
+SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
+                unsigned long, arg)
 {       
        struct file * filp;
        long err;
diff --git a/fs/file_table.c b/fs/file_table.c
index 55895ccc08c6..da806aceae3f 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -33,6 +33,9 @@ struct files_stat_struct files_stat = {
 /* public. Not pretty! */
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock);
+/* SLAB cache for file structures */
+static struct kmem_cache *filp_cachep __read_mostly;
 static struct percpu_counter nr_files __cacheline_aligned_in_smp;
 static inline void file_free_rcu(struct rcu_head *head)
@@ -399,7 +402,12 @@ too_bad:
 void __init files_init(unsigned long mempages)
 { 
        int n; 
-        /* One file with associated inode and dcache is very roughly 1K. 
+        filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
+                        SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+        /*
+         * One file with associated inode and dcache is very roughly 1K.
         * Per default don't use more than 10% of our memory for files. 
         */ 
diff --git a/fs/filesystems.c b/fs/filesystems.c
index d0e20ced62dd..1aa70260e6d1 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -179,7 +179,7 @@ static int fs_maxindex(void)
 /*
 * Whee.. Weird sysv syscall. 
 */
-asmlinkage long sys_sysfs(int option, unsigned long arg1, unsigned long arg2)
+SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2)
 {
        int retval = -EINVAL;
@@ -253,24 +253,27 @@ static int __init proc_filesystems_init(void)
 module_init(proc_filesystems_init);
 #endif
-struct file_system_type *get_fs_type(const char *name)
+static struct file_system_type *__get_fs_type(const char *name, int len)
 {
        struct file_system_type *fs;
-        const char *dot = strchr(name, '.');
-        unsigned len = dot ? dot - name : strlen(name);
        read_lock(&file_systems_lock);
        fs = *(find_filesystem(name, len));
        if (fs && !try_module_get(fs->owner))
                fs = NULL;
        read_unlock(&file_systems_lock);
-        if (!fs && (request_module("%.*s", len, name) == 0)) {
+        return fs;
-                read_lock(&file_systems_lock);
+}
-                fs = *(find_filesystem(name, len));
-                if (fs && !try_module_get(fs->owner))
+struct file_system_type *get_fs_type(const char *name)
-                        fs = NULL;
+{
-                read_unlock(&file_systems_lock);
+        struct file_system_type *fs;
-        }
+        const char *dot = strchr(name, '.');
+        int len = dot ? dot - name : strlen(name);
+        fs = __get_fs_type(name, len);
+        if (!fs && (request_module("%.*s", len, name) == 0))
+                fs = __get_fs_type(name, len);
        if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) {
                put_filesystem(fs);
diff --git a/fs/freevxfs/Kconfig b/fs/freevxfs/Kconfig
new file mode 100644
index 000000000000..8dc1cd5c1efe
--- /dev/null
+++ b/fs/freevxfs/Kconfig
@@ -0,0 +1,16 @@
+config VXFS_FS
+        tristate "FreeVxFS file system support (VERITAS VxFS(TM) compatible)"
+        depends on BLOCK
+        help
+          FreeVxFS is a file system driver that supports the VERITAS VxFS(TM)
+          file system format.  VERITAS VxFS(TM) is the standard file system
+          of SCO UnixWare (and possibly others) and optionally available
+          for Sunsoft Solaris, HP-UX and many other operating systems.
+          Currently only readonly access is supported.
+          NOTE: the file system type as used by mount(1), mount(2) and
+          fstab(5) is 'vxfs' as it describes the file system format, not
+          the actual driver.
+          To compile this as a module, choose M here: the module will be
+          called freevxfs.  If unsure, say N.
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index 9f3f2ceb73f0..03a6ea5e99f7 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -325,8 +325,10 @@ vxfs_iget(struct super_block *sbp, ino_t ino)
                if (!VXFS_ISIMMED(vip)) {
                        ip->i_op = &page_symlink_inode_operations;
                        ip->i_mapping->a_ops = &vxfs_aops;
-                } else
+                } else {
                        ip->i_op = &vxfs_immed_symlink_iops;
+                        vip->vii_immed.vi_immed[ip->i_size] = '\0';
+                }
        } else
                init_special_inode(ip, ip->i_mode, old_decode_dev(vip->vii_rdev));
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index d0ff0b8cf309..e5eaa62fd17f 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -421,9 +421,6 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 * If we're a pdlfush thread, then implement pdflush collision avoidance
 * against the entire list.
 *
- * WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so
- * that it can be located for waiting on in __writeback_single_inode().
- *
 * If `bdi' is non-zero then we're being asked to writeback a specific queue.
 * This function assumes that the blockdev superblock's inodes are backed by
 * a variety of queues, so all inodes are searched.  For other superblocks,
@@ -443,6 +440,7 @@ void generic_sync_sb_inodes(struct super_block *sb,
                                struct writeback_control *wbc)
 {
        const unsigned long start = jiffies;    /* livelock avoidance */
+        int sync = wbc->sync_mode == WB_SYNC_ALL;
        spin_lock(&inode_lock);
        if (!wbc->for_kupdate || list_empty(&sb->s_io))
@@ -499,10 +497,6 @@ void generic_sync_sb_inodes(struct super_block *sb,
                __iget(inode);
                pages_skipped = wbc->pages_skipped;
                __writeback_single_inode(inode, wbc);
-                if (wbc->sync_mode == WB_SYNC_HOLD) {
-                        inode->dirtied_when = jiffies;
-                        list_move(&inode->i_list, &sb->s_dirty);
-                }
                if (current_is_pdflush())
                        writeback_release(bdi);
                if (wbc->pages_skipped != pages_skipped) {
@@ -523,7 +517,49 @@ void generic_sync_sb_inodes(struct super_block *sb,
                if (!list_empty(&sb->s_more_io))
                        wbc->more_io = 1;
        }
-        spin_unlock(&inode_lock);
+        if (sync) {
+                struct inode *inode, *old_inode = NULL;
+                /*
+                 * Data integrity sync. Must wait for all pages under writeback,
+                 * because there may have been pages dirtied before our sync
+                 * call, but which had writeout started before we write it out.
+                 * In which case, the inode may not be on the dirty list, but
+                 * we still have to wait for that writeout.
+                 */
+                list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+                        struct address_space *mapping;
+                        if (inode->i_state & (I_FREEING|I_WILL_FREE))
+                                continue;
+                        mapping = inode->i_mapping;
+                        if (mapping->nrpages == 0)
+                                continue;
+                        __iget(inode);
+                        spin_unlock(&inode_lock);
+                        /*
+                         * We hold a reference to 'inode' so it couldn't have
+                         * been removed from s_inodes list while we dropped the
+                         * inode_lock.  We cannot iput the inode now as we can
+                         * be holding the last reference and we cannot iput it
+                         * under inode_lock. So we keep the reference and iput
+                         * it later.
+                         */
+                        iput(old_inode);
+                        old_inode = inode;
+                        filemap_fdatawait(mapping);
+                        cond_resched();
+                        spin_lock(&inode_lock);
+                }
+                spin_unlock(&inode_lock);
+                iput(old_inode);
+        } else
+                spin_unlock(&inode_lock);
        return;         /* Leave any unwritten inodes on s_io */
 }
 EXPORT_SYMBOL_GPL(generic_sync_sb_inodes);
@@ -588,8 +624,7 @@ restart:
 /*
 * writeback and wait upon the filesystem's dirty inodes.  The caller will
- * do this in two passes - one to write, and one to wait.  WB_SYNC_HOLD is
+ * do this in two passes - one to write, and one to wait.
- * used to park the written inodes on sb->s_dirty for the wait pass.
 *
 * A finite limit is set on the number of pages which will be written.
 * To prevent infinite livelock of sys_sync().
@@ -600,30 +635,21 @@ restart:
 void sync_inodes_sb(struct super_block *sb, int wait)
 {
        struct writeback_control wbc = {
-                .sync_mode      = wait ? WB_SYNC_ALL : WB_SYNC_HOLD,
+                .sync_mode      = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
                .range_start    = 0,
                .range_end      = LLONG_MAX,
        };
-        unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
-        unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
-        wbc.nr_to_write = nr_dirty + nr_unstable +
+        if (!wait) {
-                        (inodes_stat.nr_inodes - inodes_stat.nr_unused) +
+                unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
-                        nr_dirty + nr_unstable;
+                unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
-        wbc.nr_to_write += wbc.nr_to_write / 2;         /* Bit more for luck */
-        sync_sb_inodes(sb, &wbc);
-}
-/*
+                wbc.nr_to_write = nr_dirty + nr_unstable +
- * Rather lame livelock avoidance.
+                        (inodes_stat.nr_inodes - inodes_stat.nr_unused);
- */
+        } else
-static void set_sb_syncing(int val)
+                wbc.nr_to_write = LONG_MAX; /* doesn't actually matter */
-{
-        struct super_block *sb;
+        sync_sb_inodes(sb, &wbc);
-        spin_lock(&sb_lock);
-        list_for_each_entry_reverse(sb, &super_blocks, s_list)
-                sb->s_syncing = val;
-        spin_unlock(&sb_lock);
 }
 /**
@@ -652,9 +678,6 @@ static void __sync_inodes(int wait)
        spin_lock(&sb_lock);
 restart:
        list_for_each_entry(sb, &super_blocks, s_list) {
-                if (sb->s_syncing)
-                        continue;
-                sb->s_syncing = 1;
                sb->s_count++;
                spin_unlock(&sb_lock);
                down_read(&sb->s_umount);
@@ -672,13 +695,10 @@ restart:
 void sync_inodes(int wait)
 {
-        set_sb_syncing(0);
        __sync_inodes(0);
-        if (wait) {
+        if (wait)
-                set_sb_syncing(0);
                __sync_inodes(1);
-        }
 }
 /**
diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
new file mode 100644
index 000000000000..0cf160a94eda
--- /dev/null
+++ b/fs/fuse/Kconfig
@@ -0,0 +1,15 @@
+config FUSE_FS
+        tristate "FUSE (Filesystem in Userspace) support"
+        help
+          With FUSE it is possible to implement a fully functional filesystem
+          in a userspace program.
+          There's also a companion library: libfuse.  This library along with
+          utilities is available from the FUSE homepage:
+          <http://fuse.sourceforge.net/>
+          See <file:Documentation/filesystems/fuse.txt> for more information.
+          See <file:Documentation/Changes> for needed library/utility version.
+          If you want to develop a userspace FS, or if you want to use
+          a filesystem based on FUSE, answer Y or M.
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 4f3cab321415..99c99dfb0373 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -1,6 +1,6 @@
 /*
  FUSE: Filesystem in Userspace
-  Copyright (C) 2001-2006  Miklos Szeredi <miklos@szeredi.hu>
+  Copyright (C) 2001-2008  Miklos Szeredi <miklos@szeredi.hu>
  This program can be distributed under the terms of the GNU GPL.
  See the file COPYING.
@@ -48,11 +48,13 @@ static ssize_t fuse_conn_waiting_read(struct file *file, char __user *buf,
        size_t size;
        if (!*ppos) {
+                long value;
                struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
                if (!fc)
                        return 0;
-                file->private_data=(void *)(long)atomic_read(&fc->num_waiting);
+                value = atomic_read(&fc->num_waiting);
+                file->private_data = (void *)value;
                fuse_conn_put(fc);
        }
        size = sprintf(tmp, "%ld\n", (long)file->private_data);
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index fba571648a8e..ba76b68c52ff 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1,6 +1,6 @@
 /*
  FUSE: Filesystem in Userspace
-  Copyright (C) 2001-2006  Miklos Szeredi <miklos@szeredi.hu>
+  Copyright (C) 2001-2008  Miklos Szeredi <miklos@szeredi.hu>
  This program can be distributed under the terms of the GNU GPL.
  See the file COPYING.
@@ -269,7 +269,7 @@ static void flush_bg_queue(struct fuse_conn *fc)
 * Called with fc->lock, unlocks it
 */
 static void request_end(struct fuse_conn *fc, struct fuse_req *req)
-        __releases(fc->lock)
+__releases(&fc->lock)
 {
        void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
        req->end = NULL;
@@ -281,7 +281,8 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
                        fc->blocked = 0;
                        wake_up_all(&fc->blocked_waitq);
                }
-                if (fc->num_background == FUSE_CONGESTION_THRESHOLD) {
+                if (fc->num_background == FUSE_CONGESTION_THRESHOLD &&
+                    fc->connected) {
                        clear_bdi_congested(&fc->bdi, READ);
                        clear_bdi_congested(&fc->bdi, WRITE);
                }
@@ -293,13 +294,13 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
        wake_up(&req->waitq);
        if (end)
                end(fc, req);
-        else
+        fuse_put_request(fc, req);
-                fuse_put_request(fc, req);
 }
 static void wait_answer_interruptible(struct fuse_conn *fc,
                                      struct fuse_req *req)
-        __releases(fc->lock) __acquires(fc->lock)
+__releases(&fc->lock)
+__acquires(&fc->lock)
 {
        if (signal_pending(current))
                return;
@@ -317,7 +318,8 @@ static void queue_interrupt(struct fuse_conn *fc, struct fuse_req *req)
 }
 static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
-        __releases(fc->lock) __acquires(fc->lock)
+__releases(&fc->lock)
+__acquires(&fc->lock)
 {
        if (!fc->no_interrupt) {
                /* Any signal may interrupt this */
@@ -380,7 +382,7 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
        }
 }
-void request_send(struct fuse_conn *fc, struct fuse_req *req)
+void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
 {
        req->isreply = 1;
        spin_lock(&fc->lock);
@@ -399,8 +401,8 @@ void request_send(struct fuse_conn *fc, struct fuse_req *req)
        spin_unlock(&fc->lock);
 }
-static void request_send_nowait_locked(struct fuse_conn *fc,
+static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
-                                       struct fuse_req *req)
+                                            struct fuse_req *req)
 {
        req->background = 1;
        fc->num_background++;
@@ -414,11 +416,11 @@ static void request_send_nowait_locked(struct fuse_conn *fc,
        flush_bg_queue(fc);
 }
-static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
+static void fuse_request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
 {
        spin_lock(&fc->lock);
        if (fc->connected) {
-                request_send_nowait_locked(fc, req);
+                fuse_request_send_nowait_locked(fc, req);
                spin_unlock(&fc->lock);
        } else {
                req->out.h.error = -ENOTCONN;
@@ -426,16 +428,16 @@ static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
        }
 }
-void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req)
+void fuse_request_send_noreply(struct fuse_conn *fc, struct fuse_req *req)
 {
        req->isreply = 0;
-        request_send_nowait(fc, req);
+        fuse_request_send_nowait(fc, req);
 }
-void request_send_background(struct fuse_conn *fc, struct fuse_req *req)
+void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req)
 {
        req->isreply = 1;
-        request_send_nowait(fc, req);
+        fuse_request_send_nowait(fc, req);
 }
 /*
@@ -443,10 +445,11 @@ void request_send_background(struct fuse_conn *fc, struct fuse_req *req)
 *
 * fc->connected must have been checked previously
 */
-void request_send_background_locked(struct fuse_conn *fc, struct fuse_req *req)
+void fuse_request_send_background_locked(struct fuse_conn *fc,
+                                         struct fuse_req *req)
 {
        req->isreply = 1;
-        request_send_nowait_locked(fc, req);
+        fuse_request_send_nowait_locked(fc, req);
 }
 /*
@@ -539,8 +542,8 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
                BUG_ON(!cs->nr_segs);
                cs->seglen = cs->iov[0].iov_len;
                cs->addr = (unsigned long) cs->iov[0].iov_base;
-                cs->iov ++;
+                cs->iov++;
-                cs->nr_segs --;
+                cs->nr_segs--;
        }
        down_read(&current->mm->mmap_sem);
        err = get_user_pages(current, current->mm, cs->addr, 1, cs->write, 0,
@@ -589,9 +592,11 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page *page,
                kunmap_atomic(mapaddr, KM_USER1);
        }
        while (count) {
-                int err;
+                if (!cs->len) {
-                if (!cs->len && (err = fuse_copy_fill(cs)))
+                        int err = fuse_copy_fill(cs);
-                        return err;
+                        if (err)
+                                return err;
+                }
                if (page) {
                        void *mapaddr = kmap_atomic(page, KM_USER1);
                        void *buf = mapaddr + offset;
@@ -631,9 +636,11 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
 static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size)
 {
        while (size) {
-                int err;
+                if (!cs->len) {
-                if (!cs->len && (err = fuse_copy_fill(cs)))
+                        int err = fuse_copy_fill(cs);
-                        return err;
+                        if (err)
+                                return err;
+                }
                fuse_copy_do(cs, &val, &size);
        }
        return 0;
@@ -664,6 +671,8 @@ static int request_pending(struct fuse_conn *fc)
 /* Wait until a request is available on the pending list */
 static void request_wait(struct fuse_conn *fc)
+__releases(&fc->lock)
+__acquires(&fc->lock)
 {
        DECLARE_WAITQUEUE(wait, current);
@@ -691,7 +700,7 @@ static void request_wait(struct fuse_conn *fc)
 */
 static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_req *req,
                               const struct iovec *iov, unsigned long nr_segs)
-        __releases(fc->lock)
+__releases(&fc->lock)
 {
        struct fuse_copy_state cs;
        struct fuse_in_header ih;
@@ -813,6 +822,40 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
        return err;
 }
+/*
+ * Handle a FUSE_NOTIFY_POLL message: the payload must be exactly one
+ * struct fuse_notify_poll_wakeup_out.  The copy state is always finished
+ * before returning, whether or not the payload was valid.
+ */
+static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size,
+                            struct fuse_copy_state *cs)
+{
+        struct fuse_notify_poll_wakeup_out outarg;
+        int err;
+        if (size == sizeof(outarg))
+                err = fuse_copy_one(cs, &outarg, sizeof(outarg));
+        else
+                err = -EINVAL;
+        /* Success and failure both release the copy state exactly once */
+        fuse_copy_finish(cs);
+        if (err)
+                return err;
+        return fuse_notify_poll_wakeup(fc, &outarg);
+}
+/*
+ * Dispatch an unsolicited notification by its code.  Codes other than
+ * FUSE_NOTIFY_POLL are rejected with -EINVAL after finishing the copy
+ * state.
+ */
+static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
+                       unsigned int size, struct fuse_copy_state *cs)
+{
+        if (code == FUSE_NOTIFY_POLL)
+                return fuse_notify_poll(fc, size, cs);
+        /* Unrecognized notification code */
+        fuse_copy_finish(cs);
+        return -EINVAL;
+}
 /* Look up request on processing list by unique ID */
 static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique)
 {
@@ -876,9 +919,22 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
        err = fuse_copy_one(&cs, &oh, sizeof(oh));
        if (err)
                goto err_finish;
+        err = -EINVAL;
+        if (oh.len != nbytes)
+                goto err_finish;
+        /*
+         * Zero oh.unique indicates unsolicited notification message
+         * and error contains notification code.
+         */
+        if (!oh.unique) {
+                err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), &cs);
+                return err ? err : nbytes;
+        }
        err = -EINVAL;
-        if (!oh.unique || oh.error <= -1000 || oh.error > 0 ||
+        if (oh.error <= -1000 || oh.error > 0)
-            oh.len != nbytes)
                goto err_finish;
        spin_lock(&fc->lock);
@@ -966,6 +1022,8 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
 * This function releases and reacquires fc->lock
 */
 static void end_requests(struct fuse_conn *fc, struct list_head *head)
+__releases(&fc->lock)
+__acquires(&fc->lock)
 {
        while (!list_empty(head)) {
                struct fuse_req *req;
@@ -988,7 +1046,8 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head)
 * locked).
 */
 static void end_io_requests(struct fuse_conn *fc)
-        __releases(fc->lock) __acquires(fc->lock)
+__releases(&fc->lock)
+__acquires(&fc->lock)
 {
        while (!list_empty(&fc->io)) {
                struct fuse_req *req =
@@ -1002,11 +1061,11 @@ static void end_io_requests(struct fuse_conn *fc)
                wake_up(&req->waitq);
                if (end) {
                        req->end = NULL;
-                        /* The end function will consume this reference */
                        __fuse_get_request(req);
                        spin_unlock(&fc->lock);
                        wait_event(req->waitq, !req->locked);
                        end(fc, req);
+                        fuse_put_request(fc, req);
                        spin_lock(&fc->lock);
                }
        }
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 95bc22bdd060..fdff346e96fd 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1,6 +1,6 @@
 /*
  FUSE: Filesystem in Userspace
-  Copyright (C) 2001-2006  Miklos Szeredi <miklos@szeredi.hu>
+  Copyright (C) 2001-2008  Miklos Szeredi <miklos@szeredi.hu>
  This program can be distributed under the terms of the GNU GPL.
  See the file COPYING.
@@ -189,7 +189,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
                parent = dget_parent(entry);
                fuse_lookup_init(fc, req, get_node_id(parent->d_inode),
                                 &entry->d_name, &outarg);
-                request_send(fc, req);
+                fuse_request_send(fc, req);
                dput(parent);
                err = req->out.h.error;
                fuse_put_request(fc, req);
@@ -204,7 +204,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
                                return 0;
                        }
                        spin_lock(&fc->lock);
-                        fi->nlookup ++;
+                        fi->nlookup++;
                        spin_unlock(&fc->lock);
                }
                fuse_put_request(fc, forget_req);
@@ -283,7 +283,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
        attr_version = fuse_get_attr_version(fc);
        fuse_lookup_init(fc, req, nodeid, name, outarg);
-        request_send(fc, req);
+        fuse_request_send(fc, req);
        err = req->out.h.error;
        fuse_put_request(fc, req);
        /* Zero nodeid is same as -ENOENT, but with valid timeout */
@@ -369,7 +369,7 @@ static void fuse_sync_release(struct fuse_conn *fc, struct fuse_file *ff,
 {
        fuse_release_fill(ff, nodeid, flags, FUSE_RELEASE);
        ff->reserved_req->force = 1;
-        request_send(fc, ff->reserved_req);
+        fuse_request_send(fc, ff->reserved_req);
        fuse_put_request(fc, ff->reserved_req);
        kfree(ff);
 }
@@ -408,7 +408,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
                goto out_put_forget_req;
        err = -ENOMEM;
-        ff = fuse_file_alloc();
+        ff = fuse_file_alloc(fc);
        if (!ff)
                goto out_put_request;
@@ -432,7 +432,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
        req->out.args[0].value = &outentry;
        req->out.args[1].size = sizeof(outopen);
        req->out.args[1].value = &outopen;
-        request_send(fc, req);
+        fuse_request_send(fc, req);
        err = req->out.h.error;
        if (err) {
                if (err == -ENOSYS)
@@ -502,7 +502,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
        else
                req->out.args[0].size = sizeof(outarg);
        req->out.args[0].value = &outarg;
-        request_send(fc, req);
+        fuse_request_send(fc, req);
        err = req->out.h.error;
        fuse_put_request(fc, req);
        if (err)
@@ -631,15 +631,17 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
        req->in.numargs = 1;
        req->in.args[0].size = entry->d_name.len + 1;
        req->in.args[0].value = entry->d_name.name;
-        request_send(fc, req);
+        fuse_request_send(fc, req);
        err = req->out.h.error;
        fuse_put_request(fc, req);
        if (!err) {
                struct inode *inode = entry->d_inode;
-                /* Set nlink to zero so the inode can be cleared, if
+                /*
-                   the inode does have more links this will be
+                 * Set nlink to zero so the inode can be cleared, if the inode
-                   discovered at the next lookup/getattr */
+                 * does have more links this will be discovered at the next
+                 * lookup/getattr.
+                 */
                clear_nlink(inode);
                fuse_invalidate_attr(inode);
                fuse_invalidate_attr(dir);
@@ -662,7 +664,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
        req->in.numargs = 1;
        req->in.args[0].size = entry->d_name.len + 1;
        req->in.args[0].value = entry->d_name.name;
-        request_send(fc, req);
+        fuse_request_send(fc, req);
        err = req->out.h.error;
        fuse_put_request(fc, req);
        if (!err) {
@@ -695,7 +697,7 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
        req->in.args[1].value = oldent->d_name.name;
        req->in.args[2].size = newent->d_name.len + 1;
        req->in.args[2].value = newent->d_name.name;
-        request_send(fc, req);
+        fuse_request_send(fc, req);
        err = req->out.h.error;
        fuse_put_request(fc, req);
        if (!err) {
@@ -811,7 +813,7 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
        else
                req->out.args[0].size = sizeof(outarg);
        req->out.args[0].value = &outarg;
-        request_send(fc, req);
+        fuse_request_send(fc, req);
        err = req->out.h.error;
        fuse_put_request(fc, req);
        if (!err) {
@@ -911,7 +913,7 @@ static int fuse_access(struct inode *inode, int mask)
        req->in.numargs = 1;
        req->in.args[0].size = sizeof(inarg);
        req->in.args[0].value = &inarg;
-        request_send(fc, req);
+        fuse_request_send(fc, req);
        err = req->out.h.error;
        fuse_put_request(fc, req);
        if (err == -ENOSYS) {
@@ -1033,7 +1035,7 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
        req->num_pages = 1;
        req->pages[0] = page;
        fuse_read_fill(req, file, inode, file->f_pos, PAGE_SIZE, FUSE_READDIR);
-        request_send(fc, req);
+        fuse_request_send(fc, req);
        nbytes = req->out.args[0].size;
        err = req->out.h.error;
        fuse_put_request(fc, req);
@@ -1067,7 +1069,7 @@ static char *read_link(struct dentry *dentry)
        req->out.numargs = 1;
        req->out.args[0].size = PAGE_SIZE - 1;
        req->out.args[0].value = link;
-        request_send(fc, req);
+        fuse_request_send(fc, req);
        if (req->out.h.error) {
                free_page((unsigned long) link);
                link = ERR_PTR(req->out.h.error);
@@ -1273,7 +1275,7 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
        else
                req->out.args[0].size = sizeof(outarg);
        req->out.args[0].value = &outarg;
-        request_send(fc, req);
+        fuse_request_send(fc, req);
        err = req->out.h.error;
        fuse_put_request(fc, req);
        if (err) {
@@ -1367,7 +1369,7 @@ static int fuse_setxattr(struct dentry *entry, const char *name,
        req->in.args[1].value = name;
        req->in.args[2].size = size;
        req->in.args[2].value = value;
-        request_send(fc, req);
+        fuse_request_send(fc, req);
        err = req->out.h.error;
        fuse_put_request(fc, req);
        if (err == -ENOSYS) {
@@ -1413,7 +1415,7 @@ static ssize_t fuse_getxattr(struct dentry *entry, const char *name,
                req->out.args[0].size = sizeof(outarg);
                req->out.args[0].value = &outarg;
        }
-        request_send(fc, req);
+        fuse_request_send(fc, req);
        ret = req->out.h.error;
        if (!ret)
                ret = size ? req->out.args[0].size : outarg.size;
@@ -1463,7 +1465,7 @@ static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
                req->out.args[0].size = sizeof(outarg);
                req->out.args[0].value = &outarg;
        }
-        request_send(fc, req);
+        fuse_request_send(fc, req);
        ret = req->out.h.error;
        if (!ret)
                ret = size ? req->out.args[0].size : outarg.size;
@@ -1496,7 +1498,7 @@ static int fuse_removexattr(struct dentry *entry, const char *name)
        req->in.numargs = 1;
        req->in.args[0].size = strlen(name) + 1;
        req->in.args[0].value = name;
-        request_send(fc, req);
+        fuse_request_send(fc, req);
        err = req->out.h.error;
        fuse_put_request(fc, req);
        if (err == -ENOSYS) {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 34930a964b82..d9fdb7cec538 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1,6 +1,6 @@
 /*
  FUSE: Filesystem in Userspace
-  Copyright (C) 2001-2006  Miklos Szeredi <miklos@szeredi.hu>
+  Copyright (C) 2001-2008  Miklos Szeredi <miklos@szeredi.hu>
  This program can be distributed under the terms of the GNU GPL.
  See the file COPYING.
@@ -39,14 +39,14 @@ static int fuse_send_open(struct inode *inode, struct file *file, int isdir,
        req->out.numargs = 1;
        req->out.args[0].size = sizeof(*outargp);
        req->out.args[0].value = outargp;
-        request_send(fc, req);
+        fuse_request_send(fc, req);
        err = req->out.h.error;
        fuse_put_request(fc, req);
        return err;
 }
-struct fuse_file *fuse_file_alloc(void)
+struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
 {
        struct fuse_file *ff;
        ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL);
@@ -54,11 +54,16 @@ struct fuse_file *fuse_file_alloc(void)
                ff->reserved_req = fuse_request_alloc();
                if (!ff->reserved_req) {
                        kfree(ff);
-                        ff = NULL;
+                        return NULL;
                } else {
                        INIT_LIST_HEAD(&ff->write_entry);
                        atomic_set(&ff->count, 0);
+                        spin_lock(&fc->lock);
+                        ff->kh = ++fc->khctr;
+                        spin_unlock(&fc->lock);
                }
+                RB_CLEAR_NODE(&ff->polled_node);
+                init_waitqueue_head(&ff->poll_wait);
        }
        return ff;
 }
@@ -79,7 +84,6 @@ static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req)
 {
        dput(req->misc.release.dentry);
        mntput(req->misc.release.vfsmount);
-        fuse_put_request(fc, req);
 }
 static void fuse_file_put(struct fuse_file *ff)
@@ -89,7 +93,7 @@ static void fuse_file_put(struct fuse_file *ff)
                struct inode *inode = req->misc.release.dentry->d_inode;
                struct fuse_conn *fc = get_fuse_conn(inode);
                req->end = fuse_release_end;
-                request_send_background(fc, req);
+                fuse_request_send_background(fc, req);
                kfree(ff);
        }
 }
@@ -109,6 +113,7 @@ void fuse_finish_open(struct inode *inode, struct file *file,
 int fuse_open_common(struct inode *inode, struct file *file, int isdir)
 {
+        struct fuse_conn *fc = get_fuse_conn(inode);
        struct fuse_open_out outarg;
        struct fuse_file *ff;
        int err;
@@ -121,7 +126,7 @@ int fuse_open_common(struct inode *inode, struct file *file, int isdir)
        if (err)
                return err;
-        ff = fuse_file_alloc();
+        ff = fuse_file_alloc(fc);
        if (!ff)
                return -ENOMEM;
@@ -167,7 +172,11 @@ int fuse_release_common(struct inode *inode, struct file *file, int isdir)
                spin_lock(&fc->lock);
                list_del(&ff->write_entry);
+                if (!RB_EMPTY_NODE(&ff->polled_node))
+                        rb_erase(&ff->polled_node, &fc->polled_files);
                spin_unlock(&fc->lock);
+                wake_up_interruptible_sync(&ff->poll_wait);
                /*
                 * Normally this will send the RELEASE request,
                 * however if some asynchronous READ or WRITE requests
@@ -280,7 +289,7 @@ static int fuse_flush(struct file *file, fl_owner_t id)
        req->in.args[0].size = sizeof(inarg);
        req->in.args[0].value = &inarg;
        req->force = 1;
-        request_send(fc, req);
+        fuse_request_send(fc, req);
        err = req->out.h.error;
        fuse_put_request(fc, req);
        if (err == -ENOSYS) {
@@ -344,7 +353,7 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
        req->in.numargs = 1;
        req->in.args[0].size = sizeof(inarg);
        req->in.args[0].value = &inarg;
-        request_send(fc, req);
+        fuse_request_send(fc, req);
        err = req->out.h.error;
        fuse_put_request(fc, req);
        if (err == -ENOSYS) {
@@ -396,7 +405,7 @@ static size_t fuse_send_read(struct fuse_req *req, struct file *file,
                inarg->read_flags |= FUSE_READ_LOCKOWNER;
                inarg->lock_owner = fuse_lock_owner_id(fc, owner);
        }
-        request_send(fc, req);
+        fuse_request_send(fc, req);
        return req->out.args[0].size;
 }
@@ -493,7 +502,6 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
        }
        if (req->ff)
                fuse_file_put(req->ff);
-        fuse_put_request(fc, req);
 }
 static void fuse_send_readpages(struct fuse_req *req, struct file *file,
@@ -509,10 +517,11 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file,
                struct fuse_file *ff = file->private_data;
                req->ff = fuse_file_get(ff);
                req->end = fuse_readpages_end;
-                request_send_background(fc, req);
+                fuse_request_send_background(fc, req);
        } else {
-                request_send(fc, req);
+                fuse_request_send(fc, req);
                fuse_readpages_end(fc, req);
+                fuse_put_request(fc, req);
        }
 }
@@ -543,7 +552,7 @@ static int fuse_readpages_fill(void *_data, struct page *page)
                }
        }
        req->pages[req->num_pages] = page;
-        req->num_pages ++;
+        req->num_pages++;
        return 0;
 }
@@ -636,7 +645,7 @@ static size_t fuse_send_write(struct fuse_req *req, struct file *file,
                inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
                inarg->lock_owner = fuse_lock_owner_id(fc, owner);
        }
-        request_send(fc, req);
+        fuse_request_send(fc, req);
        return req->misc.write.out.size;
 }
@@ -646,7 +655,7 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping,
 {
        pgoff_t index = pos >> PAGE_CACHE_SHIFT;
-        *pagep = __grab_cache_page(mapping, index);
+        *pagep = grab_cache_page_write_begin(mapping, index, flags);
        if (!*pagep)
                return -ENOMEM;
        return 0;
@@ -779,7 +788,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
                        break;
                err = -ENOMEM;
-                page = __grab_cache_page(mapping, index);
+                page = grab_cache_page_write_begin(mapping, index, 0);
                if (!page)
                        break;
@@ -1042,7 +1051,6 @@ static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req)
 {
        __free_page(req->pages[0]);
        fuse_file_put(req->ff);
-        fuse_put_request(fc, req);
 }
 static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
@@ -1060,6 +1068,8 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
 /* Called under fc->lock, may release and reacquire it */
 static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req)
+__releases(&fc->lock)
+__acquires(&fc->lock)
 {
        struct fuse_inode *fi = get_fuse_inode(req->inode);
        loff_t size = i_size_read(req->inode);
@@ -1079,13 +1089,14 @@ static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req)
        req->in.args[1].size = inarg->size;
        fi->writectr++;
-        request_send_background_locked(fc, req);
+        fuse_request_send_background_locked(fc, req);
        return;
 out_free:
        fuse_writepage_finish(fc, req);
        spin_unlock(&fc->lock);
        fuse_writepage_free(fc, req);
+        fuse_put_request(fc, req);
        spin_lock(&fc->lock);
 }
@@ -1096,6 +1107,8 @@ static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req)
 * Called with fc->lock
 */
 void fuse_flush_writepages(struct inode *inode)
+__releases(&fc->lock)
+__acquires(&fc->lock)
 {
        struct fuse_conn *fc = get_fuse_conn(inode);
        struct fuse_inode *fi = get_fuse_inode(inode);
@@ -1325,7 +1338,7 @@ static int fuse_getlk(struct file *file, struct file_lock *fl)
        req->out.numargs = 1;
        req->out.args[0].size = sizeof(outarg);
        req->out.args[0].value = &outarg;
-        request_send(fc, req);
+        fuse_request_send(fc, req);
        err = req->out.h.error;
        fuse_put_request(fc, req);
        if (!err)
@@ -1357,7 +1370,7 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
                return PTR_ERR(req);
        fuse_lk_fill(req, file, fl, opcode, pid, flock);
-        request_send(fc, req);
+        fuse_request_send(fc, req);
        err = req->out.h.error;
        /* locking is restartable */
        if (err == -EINTR)
@@ -1433,7 +1446,7 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
        req->out.numargs = 1;
        req->out.args[0].size = sizeof(outarg);
        req->out.args[0].value = &outarg;
-        request_send(fc, req);
+        fuse_request_send(fc, req);
        err = req->out.h.error;
        fuse_put_request(fc, req);
        if (err == -ENOSYS)
@@ -1470,6 +1483,406 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin)
        return retval;
 }
+/*
+ * Copy @bytes bytes between the kernel page array @pages and the user
+ * memory described by @iov/@nr_segs.  Data moves from the pages to user
+ * space when @to_user is true, and from user space into the pages
+ * otherwise.  Pages are consumed in order, at most PAGE_SIZE bytes each.
+ *
+ * Returns 0 on success (including @bytes == 0) or -EFAULT if a user
+ * copy comes up short.
+ */
+static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
+                        unsigned int nr_segs, size_t bytes, bool to_user)
+{
+        struct iov_iter ii;
+        int page_idx = 0;
+        if (!bytes)
+                return 0;
+        iov_iter_init(&ii, iov, nr_segs, bytes, 0);
+        while (iov_iter_count(&ii)) {
+                struct page *page = pages[page_idx++];
+                size_t todo = min_t(size_t, PAGE_SIZE, iov_iter_count(&ii));
+                void *kaddr;
+                kaddr = kmap(page);
+                while (todo) {
+                        char __user *uaddr = ii.iov->iov_base + ii.iov_offset;
+                        size_t iov_len = ii.iov->iov_len - ii.iov_offset;
+                        size_t copy = min(todo, iov_len);
+                        size_t left;
+                        if (!to_user)
+                                left = copy_from_user(kaddr, uaddr, copy);
+                        else
+                                left = copy_to_user(uaddr, kaddr, copy);
+                        if (unlikely(left)) {
+                                /* Do not leak the mapping on the fault path */
+                                kunmap(page);
+                                return -EFAULT;
+                        }
+                        iov_iter_advance(&ii, copy);
+                        todo -= copy;
+                        kaddr += copy;
+                }
+                /* kunmap() takes the page, not the mapped address */
+                kunmap(page);
+        }
+        return 0;
+}
+/*
+ * For ioctls, there is no generic way to determine how much memory
+ * needs to be read and/or written.  Furthermore, ioctls are allowed
+ * to dereference the passed pointer, so the parameter requires deep
+ * copying but FUSE has no idea whatsoever about what to copy in or
+ * out.
+ *
+ * This is solved by allowing FUSE server to retry ioctl with
+ * necessary in/out iovecs.  Let's assume the ioctl implementation
+ * needs to read in the following structure.
+ *
+ * struct a {
+ *      char    *buf;
+ *      size_t  buflen;
+ * }
+ *
+ * On the first callout to FUSE server, inarg->in_size and
+ * inarg->out_size will be zero; then, the server completes the ioctl
+ * with FUSE_IOCTL_RETRY set in out->flags, out->in_iovs set to 1 and
+ * the actual iov array to
+ *
+ * { { .iov_base = inarg.arg,   .iov_len = sizeof(struct a) } }
+ *
+ * which tells FUSE to copy in the requested area and retry the ioctl.
+ * On the second round, the server has access to the structure and
+ * from that it can tell what to look for next, so on the invocation,
+ * it sets FUSE_IOCTL_RETRY, out->in_iovs to 2 and iov array to
+ *
+ * { { .iov_base = inarg.arg,   .iov_len = sizeof(struct a)     },
+ *   { .iov_base = a.buf,       .iov_len = a.buflen             } }
+ *
+ * FUSE will copy both struct a and the pointed buffer from the
+ * process doing the ioctl and retry ioctl with both struct a and the
+ * buffer.
+ *
+ * This time, FUSE server has everything it needs and completes ioctl
+ * without FUSE_IOCTL_RETRY which finishes the ioctl call.
+ *
+ * Copying data out works the same way.
+ *
+ * Note that if FUSE_IOCTL_UNRESTRICTED is clear, the kernel
+ * automatically initializes in and out iovs by decoding @cmd with
+ * _IOC_* macros and the server is not allowed to request RETRY.  This
+ * limits ioctl data transfers to well-formed ioctls and is the forced
+ * behavior for all FUSE servers.
+ */
+/*
+ * Common ioctl implementation: forwards @cmd/@arg to the FUSE server as a
+ * FUSE_IOCTL request, copying user data in and out through temporary
+ * pages, and re-issuing the request when the server answers with
+ * FUSE_IOCTL_RETRY (only honoured if @flags has FUSE_IOCTL_UNRESTRICTED).
+ *
+ * Returns the server-supplied outarg.result on success, or a negative
+ * errno: -EACCES if the task is not allowed on this connection, -EIO for
+ * a bad inode or a malformed reply, -ENOMEM on allocation failure or
+ * when the requested transfer does not fit the per-request limits,
+ * -EFAULT from the user copies, or the error reported for the request.
+ */
+static long fuse_file_do_ioctl(struct file *file, unsigned int cmd,
+                               unsigned long arg, unsigned int flags)
+{
+        struct inode *inode = file->f_dentry->d_inode;
+        struct fuse_file *ff = file->private_data;
+        struct fuse_conn *fc = get_fuse_conn(inode);
+        struct fuse_ioctl_in inarg = {
+                .fh = ff->fh,
+                .cmd = cmd,
+                .arg = arg,
+                .flags = flags
+        };
+        struct fuse_ioctl_out outarg;
+        struct fuse_req *req = NULL;
+        struct page **pages = NULL;
+        struct page *iov_page = NULL;
+        struct iovec *in_iov = NULL, *out_iov = NULL;
+        unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages;
+        size_t in_size, out_size, transferred;
+        int err;
+        /* assume all the iovs returned by client always fit in a page */
+        BUILD_BUG_ON(sizeof(struct iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
+        if (!fuse_allow_task(fc, current))
+                return -EACCES;
+        err = -EIO;
+        if (is_bad_inode(inode))
+                goto out;
+        err = -ENOMEM;
+        pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL);
+        iov_page = alloc_page(GFP_KERNEL);
+        if (!pages || !iov_page)
+                goto out;
+        /*
+         * If restricted, initialize IO parameters as encoded in @cmd.
+         * RETRY from server is not allowed.
+         */
+        if (!(flags & FUSE_IOCTL_UNRESTRICTED)) {
+                struct iovec *iov = page_address(iov_page);
+                iov->iov_base = (void __user *)arg;
+                iov->iov_len = _IOC_SIZE(cmd);
+                if (_IOC_DIR(cmd) & _IOC_WRITE) {
+                        in_iov = iov;
+                        in_iovs = 1;
+                }
+                if (_IOC_DIR(cmd) & _IOC_READ) {
+                        out_iov = iov;
+                        out_iovs = 1;
+                }
+        }
+ retry:
+        inarg.in_size = in_size = iov_length(in_iov, in_iovs);
+        inarg.out_size = out_size = iov_length(out_iov, out_iovs);
+        /*
+         * Out data can be used either for actual out data or iovs,
+         * make sure there always is at least one page.
+         */
+        out_size = max_t(size_t, out_size, PAGE_SIZE);
+        max_pages = DIV_ROUND_UP(max(in_size, out_size), PAGE_SIZE);
+        /* make sure there are enough buffer pages and init request with them */
+        err = -ENOMEM;
+        if (max_pages > FUSE_MAX_PAGES_PER_REQ)
+                goto out;
+        /* pages allocated on earlier rounds are kept; only top up here */
+        while (num_pages < max_pages) {
+                pages[num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
+                if (!pages[num_pages])
+                        goto out;
+                num_pages++;
+        }
+        req = fuse_get_req(fc);
+        if (IS_ERR(req)) {
+                err = PTR_ERR(req);
+                /* clear req so the cleanup path does not put an ERR_PTR */
+                req = NULL;
+                goto out;
+        }
+        memcpy(req->pages, pages, sizeof(req->pages[0]) * num_pages);
+        req->num_pages = num_pages;
+        /* okay, let's send it to the client */
+        req->in.h.opcode = FUSE_IOCTL;
+        req->in.h.nodeid = get_node_id(inode);
+        req->in.numargs = 1;
+        req->in.args[0].size = sizeof(inarg);
+        req->in.args[0].value = &inarg;
+        if (in_size) {
+                req->in.numargs++;
+                req->in.args[1].size = in_size;
+                req->in.argpages = 1;
+                err = fuse_ioctl_copy_user(pages, in_iov, in_iovs, in_size,
+                                           false);
+                if (err)
+                        goto out;
+        }
+        req->out.numargs = 2;
+        req->out.args[0].size = sizeof(outarg);
+        req->out.args[0].value = &outarg;
+        req->out.args[1].size = out_size;
+        req->out.argpages = 1;
+        req->out.argvar = 1;
+        fuse_request_send(fc, req);
+        err = req->out.h.error;
+        transferred = req->out.args[1].size;
+        fuse_put_request(fc, req);
+        req = NULL;
+        if (err)
+                goto out;
+        /* did it ask for retry? */
+        if (outarg.flags & FUSE_IOCTL_RETRY) {
+                char *vaddr;
+                /* no retry if in restricted mode */
+                err = -EIO;
+                if (!(flags & FUSE_IOCTL_UNRESTRICTED))
+                        goto out;
+                in_iovs = outarg.in_iovs;
+                out_iovs = outarg.out_iovs;
+                /*
+                 * Make sure things are in boundary, separate checks
+                 * are to protect against overflow.
+                 */
+                err = -ENOMEM;
+                if (in_iovs > FUSE_IOCTL_MAX_IOV ||
+                    out_iovs > FUSE_IOCTL_MAX_IOV ||
+                    in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV)
+                        goto out;
+                /* the reply payload must be exactly the announced iovecs */
+                err = -EIO;
+                if ((in_iovs + out_iovs) * sizeof(struct iovec) != transferred)
+                        goto out;
+                /* okay, copy in iovs and retry */
+                vaddr = kmap_atomic(pages[0], KM_USER0);
+                memcpy(page_address(iov_page), vaddr, transferred);
+                kunmap_atomic(vaddr, KM_USER0);
+                /* in iovecs come first in the page, out iovecs follow them */
+                in_iov = page_address(iov_page);
+                out_iov = in_iov + in_iovs;
+                goto retry;
+        }
+        /* the server may not return more data than the out iovecs hold */
+        err = -EIO;
+        if (transferred > inarg.out_size)
+                goto out;
+        err = fuse_ioctl_copy_user(pages, out_iov, out_iovs, transferred, true);
+ out:
+        if (req)
+                fuse_put_request(fc, req);
+        if (iov_page)
+                __free_page(iov_page);
+        while (num_pages)
+                __free_page(pages[--num_pages]);
+        kfree(pages);
+        /* outarg is only read when the request completed without error */
+        return err ? err : outarg.result;
+}
+/* Native ioctl entry point: hand off to the common handler with no flags. */
+static long fuse_file_ioctl(struct file *file, unsigned int cmd,
+                            unsigned long arg)
+{
+        const unsigned int no_flags = 0;
+        return fuse_file_do_ioctl(file, cmd, arg, no_flags);
+}
+/* Compat ioctl entry point: common handler, tagged with FUSE_IOCTL_COMPAT. */
+static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd,
+                                   unsigned long arg)
+{
+        long ret;
+        ret = fuse_file_do_ioctl(file, cmd, arg, FUSE_IOCTL_COMPAT);
+        return ret;
+}
+/*
+ * Polled files live on the fuse_conn->polled_files RB tree, keyed by kh.
+ * Descend the tree looking for @kh.
+ *
+ * Returns the link that points at the matching node, or, when there is
+ * no match, the empty link where such a node would be inserted.  In the
+ * no-match case *@parent_out (if @parent_out is non-NULL) receives the
+ * last node visited; it is left untouched when a match is found.
+ */
+static struct rb_node **fuse_find_polled_node(struct fuse_conn *fc, u64 kh,
+                                              struct rb_node **parent_out)
+{
+        struct rb_node **slot = &fc->polled_files.rb_node;
+        struct rb_node *parent = NULL;
+        struct rb_node *cur;
+        for (cur = *slot; cur; cur = *slot) {
+                struct fuse_file *ff;
+                ff = rb_entry(cur, struct fuse_file, polled_node);
+                if (kh == ff->kh)
+                        return slot;
+                parent = cur;
+                slot = (kh < ff->kh) ? &cur->rb_left : &cur->rb_right;
+        }
+        if (parent_out)
+                *parent_out = parent;
+        return slot;
+}
+/*
+ * Called when a file is about to be polled: link it into the
+ * polled_files RB tree unless it is already there.  Once added, a file
+ * stays on the tree until it is released, since a file polled once is
+ * likely to be polled again.
+ */
+static void fuse_register_polled_file(struct fuse_conn *fc,
+                                      struct fuse_file *ff)
+{
+        struct rb_node **link, *parent;
+        spin_lock(&fc->lock);
+        /* Already on the tree: nothing to do */
+        if (!RB_EMPTY_NODE(&ff->polled_node))
+                goto out_unlock;
+        link = fuse_find_polled_node(fc, ff->kh, &parent);
+        /* An existing node with the same kh is treated as fatal */
+        BUG_ON(*link);
+        rb_link_node(&ff->polled_node, parent, link);
+        rb_insert_color(&ff->polled_node, &fc->polled_files);
+out_unlock:
+        spin_unlock(&fc->lock);
+}
+/*
+ * ->poll() for FUSE files: register on the file's poll wait queue and
+ * ask the server for the current event mask with a FUSE_POLL request.
+ *
+ * Returns the revents mask reported by the server, DEFAULT_POLLMASK if
+ * the server does not implement poll (-ENOSYS, remembered in
+ * fc->no_poll), or POLLERR on any other failure.  The return value is an
+ * event mask, so errors are never returned as negative errno values.
+ */
+static unsigned fuse_file_poll(struct file *file, poll_table *wait)
+{
+        struct inode *inode = file->f_dentry->d_inode;
+        struct fuse_file *ff = file->private_data;
+        struct fuse_conn *fc = get_fuse_conn(inode);
+        struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh };
+        struct fuse_poll_out outarg;
+        struct fuse_req *req;
+        int err;
+        if (fc->no_poll)
+                return DEFAULT_POLLMASK;
+        poll_wait(file, &ff->poll_wait, wait);
+        /*
+         * Ask for notification iff there's someone waiting for it.
+         * The client may ignore the flag and always notify.
+         */
+        if (waitqueue_active(&ff->poll_wait)) {
+                inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY;
+                fuse_register_polled_file(fc, ff);
+        }
+        req = fuse_get_req(fc);
+        /* PTR_ERR() here would be misread by the caller as event bits */
+        if (IS_ERR(req))
+                return POLLERR;
+        req->in.h.opcode = FUSE_POLL;
+        req->in.h.nodeid = get_node_id(inode);
+        req->in.numargs = 1;
+        req->in.args[0].size = sizeof(inarg);
+        req->in.args[0].value = &inarg;
+        req->out.numargs = 1;
+        req->out.args[0].size = sizeof(outarg);
+        req->out.args[0].value = &outarg;
+        fuse_request_send(fc, req);
+        err = req->out.h.error;
+        fuse_put_request(fc, req);
+        if (!err)
+                return outarg.revents;
+        if (err == -ENOSYS) {
+                /* Server has no poll support: stop asking from now on */
+                fc->no_poll = 1;
+                return DEFAULT_POLLMASK;
+        }
+        return POLLERR;
+}
+/*
+ * Called from fuse_notify_poll() when a FUSE_NOTIFY_POLL message
+ * arrives: look up the polled file by its kernel handle and wake up its
+ * poll waiters.  An unknown handle is silently ignored.  Always returns 0.
+ */
+int fuse_notify_poll_wakeup(struct fuse_conn *fc,
+                            struct fuse_notify_poll_wakeup_out *outarg)
+{
+        struct rb_node **slot;
+        struct fuse_file *ff;
+        spin_lock(&fc->lock);
+        slot = fuse_find_polled_node(fc, outarg->kh, NULL);
+        if (*slot) {
+                ff = rb_entry(*slot, struct fuse_file, polled_node);
+                wake_up_interruptible_sync(&ff->poll_wait);
+        }
+        spin_unlock(&fc->lock);
+        return 0;
+}
 static const struct file_operations fuse_file_operations = {
        .llseek         = fuse_file_llseek,
        .read           = do_sync_read,
@@ -1484,6 +1897,9 @@ static const struct file_operations fuse_file_operations = {
        .lock           = fuse_file_lock,
        .flock          = fuse_file_flock,
        .splice_read    = generic_file_splice_read,
+        .unlocked_ioctl = fuse_file_ioctl,
+        .compat_ioctl   = fuse_file_compat_ioctl,
+        .poll           = fuse_file_poll,
 };
 static const struct file_operations fuse_direct_io_file_operations = {
@@ -1496,6 +1912,9 @@ static const struct file_operations fuse_direct_io_file_operations = {
        .fsync          = fuse_fsync,
        .lock           = fuse_file_lock,
        .flock          = fuse_file_flock,
+        .unlocked_ioctl = fuse_file_ioctl,
+        .compat_ioctl   = fuse_file_compat_ioctl,
+        .poll           = fuse_file_poll,
        /* no mmap and splice_read */
 };
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 35accfdd747f..5e64b815a5a1 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -1,6 +1,6 @@
 /*
  FUSE: Filesystem in Userspace
-  Copyright (C) 2001-2006  Miklos Szeredi <miklos@szeredi.hu>
+  Copyright (C) 2001-2008  Miklos Szeredi <miklos@szeredi.hu>
  This program can be distributed under the terms of the GNU GPL.
  See the file COPYING.
@@ -19,6 +19,8 @@
 #include <linux/backing-dev.h>
 #include <linux/mutex.h>
 #include <linux/rwsem.h>
+#include <linux/rbtree.h>
+#include <linux/poll.h>
 /** Max number of pages that can be used in a single read request */
 #define FUSE_MAX_PAGES_PER_REQ 32
@@ -100,6 +102,9 @@ struct fuse_file {
        /** Request reserved for flush and release */
        struct fuse_req *reserved_req;
+        /** Kernel file handle guaranteed to be unique */
+        u64 kh;
        /** File handle used by userspace */
        u64 fh;
@@ -108,6 +113,12 @@ struct fuse_file {
        /** Entry on inode's write_files list */
        struct list_head write_entry;
+        /** RB node to be linked on fuse_conn->polled_files */
+        struct rb_node polled_node;
+        /** Wait queue head for poll */
+        wait_queue_head_t poll_wait;
 };
 /** One input argument of a request */
@@ -322,6 +333,12 @@ struct fuse_conn {
        /** The list of requests under I/O */
        struct list_head io;
+        /** The next unique kernel file handle */
+        u64 khctr;
+        /** rbtree of fuse_files waiting for poll events indexed by kh */
+        struct rb_root polled_files;
        /** Number of requests currently in the background */
        unsigned num_background;
@@ -355,19 +372,19 @@ struct fuse_conn {
        /** Connection failed (version mismatch).  Cannot race with
            setting other bitfields since it is only set once in INIT
            reply, before any other request, and never cleared */
-        unsigned conn_error : 1;
+        unsigned conn_error:1;
        /** Connection successful.  Only set in INIT */
-        unsigned conn_init : 1;
+        unsigned conn_init:1;
        /** Do readpages asynchronously?  Only set in INIT */
-        unsigned async_read : 1;
+        unsigned async_read:1;
        /** Do not send separate SETATTR request before open(O_TRUNC)  */
-        unsigned atomic_o_trunc : 1;
+        unsigned atomic_o_trunc:1;
        /** Filesystem supports NFS exporting.  Only set in INIT */
-        unsigned export_support : 1;
+        unsigned export_support:1;
        /*
         * The following bitfields are only for optimization purposes
@@ -375,43 +392,46 @@ struct fuse_conn {
         */
        /** Is fsync not implemented by fs? */
-        unsigned no_fsync : 1;
+        unsigned no_fsync:1;
        /** Is fsyncdir not implemented by fs? */
-        unsigned no_fsyncdir : 1;
+        unsigned no_fsyncdir:1;
        /** Is flush not implemented by fs? */
-        unsigned no_flush : 1;
+        unsigned no_flush:1;
        /** Is setxattr not implemented by fs? */
-        unsigned no_setxattr : 1;
+        unsigned no_setxattr:1;
        /** Is getxattr not implemented by fs? */
-        unsigned no_getxattr : 1;
+        unsigned no_getxattr:1;
        /** Is listxattr not implemented by fs? */
-        unsigned no_listxattr : 1;
+        unsigned no_listxattr:1;
        /** Is removexattr not implemented by fs? */
-        unsigned no_removexattr : 1;
+        unsigned no_removexattr:1;
        /** Are file locking primitives not implemented by fs? */
-        unsigned no_lock : 1;
+        unsigned no_lock:1;
        /** Is access not implemented by fs? */
-        unsigned no_access : 1;
+        unsigned no_access:1;
        /** Is create not implemented by fs? */
-        unsigned no_create : 1;
+        unsigned no_create:1;
        /** Is interrupt not implemented by fs? */
-        unsigned no_interrupt : 1;
+        unsigned no_interrupt:1;
        /** Is bmap not implemented by fs? */
-        unsigned no_bmap : 1;
+        unsigned no_bmap:1;
+        /** Is poll not implemented by fs? */
+        unsigned no_poll:1;
        /** Do multi-page cached writes */
-        unsigned big_writes : 1;
+        unsigned big_writes:1;
        /** The number of requests waiting for completion */
        atomic_t num_waiting;
@@ -445,6 +465,9 @@ struct fuse_conn {
        /** Version counter for attribute changes */
        u64 attr_version;
+        /** Called on final put */
+        void (*release)(struct fuse_conn *);
 };
 static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
@@ -499,7 +522,7 @@ void fuse_read_fill(struct fuse_req *req, struct file *file,
 */
 int fuse_open_common(struct inode *inode, struct file *file, int isdir);
-struct fuse_file *fuse_file_alloc(void);
+struct fuse_file *fuse_file_alloc(struct fuse_conn *fc);
 void fuse_file_free(struct fuse_file *ff);
 void fuse_finish_open(struct inode *inode, struct file *file,
                      struct fuse_file *ff, struct fuse_open_out *outarg);
@@ -519,6 +542,12 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
                      int isdir);
 /**
+ * Notify poll wakeup
+ */
+int fuse_notify_poll_wakeup(struct fuse_conn *fc,
+                            struct fuse_notify_poll_wakeup_out *outarg);
+/**
 * Initialize file operations on a regular file
 */
 void fuse_init_file_inode(struct inode *inode);
@@ -593,19 +622,20 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req);
 /**
 * Send a request (synchronous)
 */
-void request_send(struct fuse_conn *fc, struct fuse_req *req);
+void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req);
 /**
 * Send a request with no reply
 */
-void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req);
+void fuse_request_send_noreply(struct fuse_conn *fc, struct fuse_req *req);
 /**
 * Send a request in the background
 */
-void request_send_background(struct fuse_conn *fc, struct fuse_req *req);
+void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req);
-void request_send_background_locked(struct fuse_conn *fc, struct fuse_req *req);
+void fuse_request_send_background_locked(struct fuse_conn *fc,
+                                         struct fuse_req *req);
 /* Abort all requests */
 void fuse_abort_conn(struct fuse_conn *fc);
@@ -623,6 +653,11 @@ void fuse_invalidate_entry_cache(struct dentry *entry);
 struct fuse_conn *fuse_conn_get(struct fuse_conn *fc);
 /**
+ * Initialize fuse_conn
+ */
+int fuse_conn_init(struct fuse_conn *fc, struct super_block *sb);
+/**
 * Release reference to fuse_conn
 */
 void fuse_conn_put(struct fuse_conn *fc);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 2e99f34b4435..459b73dd45e1 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1,6 +1,6 @@
 /*
  FUSE: Filesystem in Userspace
-  Copyright (C) 2001-2006  Miklos Szeredi <miklos@szeredi.hu>
+  Copyright (C) 2001-2008  Miklos Szeredi <miklos@szeredi.hu>
  This program can be distributed under the terms of the GNU GPL.
  See the file COPYING.
@@ -37,10 +37,10 @@ struct fuse_mount_data {
        unsigned rootmode;
        unsigned user_id;
        unsigned group_id;
-        unsigned fd_present : 1;
+        unsigned fd_present:1;
-        unsigned rootmode_present : 1;
+        unsigned rootmode_present:1;
-        unsigned user_id_present : 1;
+        unsigned user_id_present:1;
-        unsigned group_id_present : 1;
+        unsigned group_id_present:1;
        unsigned flags;
        unsigned max_read;
        unsigned blksize;
@@ -94,7 +94,7 @@ void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req,
        req->in.numargs = 1;
        req->in.args[0].size = sizeof(struct fuse_forget_in);
        req->in.args[0].value = inarg;
-        request_send_noreply(fc, req);
+        fuse_request_send_noreply(fc, req);
 }
 static void fuse_clear_inode(struct inode *inode)
@@ -250,7 +250,7 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
        fi = get_fuse_inode(inode);
        spin_lock(&fc->lock);
-        fi->nlookup ++;
+        fi->nlookup++;
        spin_unlock(&fc->lock);
        fuse_change_attributes(inode, attr, attr_valid, attr_version);
@@ -269,7 +269,7 @@ static void fuse_send_destroy(struct fuse_conn *fc)
                fc->destroy_req = NULL;
                req->in.h.opcode = FUSE_DESTROY;
                req->force = 1;
-                request_send(fc, req);
+                fuse_request_send(fc, req);
                fuse_put_request(fc, req);
        }
 }
@@ -292,6 +292,7 @@ static void fuse_put_super(struct super_block *sb)
        list_del(&fc->entry);
        fuse_ctl_remove_conn(fc);
        mutex_unlock(&fuse_mutex);
+        bdi_destroy(&fc->bdi);
        fuse_conn_put(fc);
 }
@@ -334,7 +335,7 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf)
        req->out.args[0].size =
                fc->minor < 4 ? FUSE_COMPAT_STATFS_SIZE : sizeof(outarg);
        req->out.args[0].value = &outarg;
-        request_send(fc, req);
+        fuse_request_send(fc, req);
        err = req->out.h.error;
        if (!err)
                convert_fuse_statfs(buf, &outarg.st);
@@ -462,68 +463,69 @@ static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt)
        return 0;
 }
-static struct fuse_conn *new_conn(struct super_block *sb)
+int fuse_conn_init(struct fuse_conn *fc, struct super_block *sb)
 {
-        struct fuse_conn *fc;
        int err;
-        fc = kzalloc(sizeof(*fc), GFP_KERNEL);
+        memset(fc, 0, sizeof(*fc));
-        if (fc) {
+        spin_lock_init(&fc->lock);
-                spin_lock_init(&fc->lock);
+        mutex_init(&fc->inst_mutex);
-                mutex_init(&fc->inst_mutex);
+        atomic_set(&fc->count, 1);
-                atomic_set(&fc->count, 1);
+        init_waitqueue_head(&fc->waitq);
-                init_waitqueue_head(&fc->waitq);
+        init_waitqueue_head(&fc->blocked_waitq);
-                init_waitqueue_head(&fc->blocked_waitq);
+        init_waitqueue_head(&fc->reserved_req_waitq);
-                init_waitqueue_head(&fc->reserved_req_waitq);
+        INIT_LIST_HEAD(&fc->pending);
-                INIT_LIST_HEAD(&fc->pending);
+        INIT_LIST_HEAD(&fc->processing);
-                INIT_LIST_HEAD(&fc->processing);
+        INIT_LIST_HEAD(&fc->io);
-                INIT_LIST_HEAD(&fc->io);
+        INIT_LIST_HEAD(&fc->interrupts);
-                INIT_LIST_HEAD(&fc->interrupts);
+        INIT_LIST_HEAD(&fc->bg_queue);
-                INIT_LIST_HEAD(&fc->bg_queue);
+        INIT_LIST_HEAD(&fc->entry);
-                atomic_set(&fc->num_waiting, 0);
+        atomic_set(&fc->num_waiting, 0);
-                fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
+        fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
-                fc->bdi.unplug_io_fn = default_unplug_io_fn;
+        fc->bdi.unplug_io_fn = default_unplug_io_fn;
-                /* fuse does it's own writeback accounting */
+        /* fuse does its own writeback accounting */
-                fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB;
+        fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB;
-                fc->dev = sb->s_dev;
+        fc->khctr = 0;
-                err = bdi_init(&fc->bdi);
+        fc->polled_files = RB_ROOT;
-                if (err)
+        fc->dev = sb->s_dev;
-                        goto error_kfree;
+        err = bdi_init(&fc->bdi);
-                if (sb->s_bdev) {
+        if (err)
-                        err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk",
+                goto error_mutex_destroy;
-                                           MAJOR(fc->dev), MINOR(fc->dev));
+        if (sb->s_bdev) {
-                } else {
+                err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk",
-                        err = bdi_register_dev(&fc->bdi, fc->dev);
+                                   MAJOR(fc->dev), MINOR(fc->dev));
-                }
+        } else {
-                if (err)
+                err = bdi_register_dev(&fc->bdi, fc->dev);
-                        goto error_bdi_destroy;
-                /*
-                 * For a single fuse filesystem use max 1% of dirty +
-                 * writeback threshold.
-                 *
-                 * This gives about 1M of write buffer for memory maps on a
-                 * machine with 1G and 10% dirty_ratio, which should be more
-                 * than enough.
-                 *
-                 * Privileged users can raise it by writing to
-                 *
-                 *    /sys/class/bdi/<bdi>/max_ratio
-                 */
-                bdi_set_max_ratio(&fc->bdi, 1);
-                fc->reqctr = 0;
-                fc->blocked = 1;
-                fc->attr_version = 1;
-                get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
        }
-        return fc;
+        if (err)
+                goto error_bdi_destroy;
+        /*
+         * For a single fuse filesystem use max 1% of dirty +
+         * writeback threshold.
+         *
+         * This gives about 1M of write buffer for memory maps on a
+         * machine with 1G and 10% dirty_ratio, which should be more
+         * than enough.
+         *
+         * Privileged users can raise it by writing to
+         *
+         *    /sys/class/bdi/<bdi>/max_ratio
+         */
+        bdi_set_max_ratio(&fc->bdi, 1);
+        fc->reqctr = 0;
+        fc->blocked = 1;
+        fc->attr_version = 1;
+        get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
-error_bdi_destroy:
+        return 0;
+ error_bdi_destroy:
        bdi_destroy(&fc->bdi);
-error_kfree:
+ error_mutex_destroy:
        mutex_destroy(&fc->inst_mutex);
-        kfree(fc);
+        return err;
-        return NULL;
 }
+EXPORT_SYMBOL_GPL(fuse_conn_init);
 void fuse_conn_put(struct fuse_conn *fc)
 {
@@ -531,8 +533,7 @@ void fuse_conn_put(struct fuse_conn *fc)
                if (fc->destroy_req)
                        fuse_request_free(fc->destroy_req);
                mutex_destroy(&fc->inst_mutex);
-                bdi_destroy(&fc->bdi);
+                fc->release(fc);
-                kfree(fc);
        }
 }
@@ -542,7 +543,7 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc)
        return fc;
 }
-static struct inode *get_root_inode(struct super_block *sb, unsigned mode)
+static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode)
 {
        struct fuse_attr attr;
        memset(&attr, 0, sizeof(attr));
@@ -553,8 +554,7 @@ static struct inode *get_root_inode(struct super_block *sb, unsigned mode)
        return fuse_iget(sb, 1, 0, &attr, 0, 0);
 }
-struct fuse_inode_handle
+struct fuse_inode_handle {
-{
        u64 nodeid;
        u32 generation;
 };
@@ -761,7 +761,6 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
                fc->max_write = max_t(unsigned, 4096, fc->max_write);
                fc->conn_init = 1;
        }
-        fuse_put_request(fc, req);
        fc->blocked = 0;
        wake_up_all(&fc->blocked_waitq);
 }
@@ -787,7 +786,12 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
        req->out.args[0].size = sizeof(struct fuse_init_out);
        req->out.args[0].value = &req->misc.init_out;
        req->end = process_init_reply;
-        request_send_background(fc, req);
+        fuse_request_send_background(fc, req);
+}
+static void fuse_free_conn(struct fuse_conn *fc) /* fc->release callback set by fuse_fill_super() */
+{
+        kfree(fc); /* fuse_fill_super() obtained fc from kmalloc() */
 }
 static int fuse_fill_super(struct super_block *sb, void *data, int silent)
@@ -801,16 +805,18 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
        int err;
        int is_bdev = sb->s_bdev != NULL;
+        err = -EINVAL;
        if (sb->s_flags & MS_MANDLOCK)
-                return -EINVAL;
+                goto err;
        if (!parse_fuse_opt((char *) data, &d, is_bdev))
-                return -EINVAL;
+                goto err;
        if (is_bdev) {
 #ifdef CONFIG_BLOCK
+                err = -EINVAL;
                if (!sb_set_blocksize(sb, d.blksize))
-                        return -EINVAL;
+                        goto err;
 #endif
        } else {
                sb->s_blocksize = PAGE_CACHE_SIZE;
@@ -822,16 +828,25 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
        sb->s_export_op = &fuse_export_operations;
        file = fget(d.fd);
+        err = -EINVAL;
        if (!file)
-                return -EINVAL;
+                goto err;
        if (file->f_op != &fuse_dev_operations)
-                return -EINVAL;
+                goto err_fput;
-        fc = new_conn(sb);
+        fc = kmalloc(sizeof(*fc), GFP_KERNEL);
+        err = -ENOMEM;
        if (!fc)
-                return -ENOMEM;
+                goto err_fput;
+        err = fuse_conn_init(fc, sb);
+        if (err) {
+                kfree(fc);
+                goto err_fput;
+        }
+        fc->release = fuse_free_conn;
        fc->flags = d.flags;
        fc->user_id = d.user_id;
        fc->group_id = d.group_id;
@@ -841,14 +856,14 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
        sb->s_fs_info = fc;
        err = -ENOMEM;
-        root = get_root_inode(sb, d.rootmode);
+        root = fuse_get_root_inode(sb, d.rootmode);
        if (!root)
-                goto err;
+                goto err_put_conn;
        root_dentry = d_alloc_root(root);
        if (!root_dentry) {
                iput(root);
-                goto err;
+                goto err_put_conn;
        }
        init_req = fuse_request_alloc();
@@ -892,9 +907,11 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
        fuse_request_free(init_req);
 err_put_root:
        dput(root_dentry);
- err:
+ err_put_conn:
-        fput(file);
        fuse_conn_put(fc);
+ err_fput:
+        fput(file);
+ err:
        return err;
 }
@@ -952,7 +969,7 @@ static inline void unregister_fuseblk(void)
 static void fuse_inode_init_once(void *foo)
 {
-        struct inode * inode = foo;
+        struct inode *inode = foo;
        inode_init_once(inode);
 }
@@ -1031,7 +1048,7 @@ static int __init fuse_init(void)
 {
        int res;
-        printk("fuse init (API version %i.%i)\n",
+        printk(KERN_INFO "fuse init (API version %i.%i)\n",
               FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION);
        INIT_LIST_HEAD(&fuse_conn_list);
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index ab2f57e3fb87..e563a6449811 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -1,6 +1,6 @@
 config GFS2_FS
        tristate "GFS2 file system support"
-        depends on EXPERIMENTAL && (64BIT || (LSF && LBD))
+        depends on EXPERIMENTAL && (64BIT || LBD)
        select FS_POSIX_ACL
        select CRC32
        help
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index ec65851ec80a..c1b4ec6a9650 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -1,5 +1,5 @@
 obj-$(CONFIG_GFS2_FS) += gfs2.o
-gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \
+gfs2-y := acl.o bmap.o dir.o eaops.o eattr.o glock.o \
        glops.o inode.o log.o lops.o locking.o main.o meta_io.o \
        mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \
        ops_fstype.o ops_inode.o ops_super.o quota.o \
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 3e9bd46f27e3..e335dceb6a4f 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -91,7 +91,7 @@ static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl,
        struct gfs2_ea_location el_this;
        int error;
-        if (!ip->i_di.di_eattr)
+        if (!ip->i_eattr)
                return 0;
        memset(&er, 0, sizeof(struct gfs2_ea_request));
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index bec76b1c2bb0..11ffc56f1f81 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -75,9 +75,9 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
                void *kaddr = kmap(page);
                memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode),
-                       ip->i_di.di_size);
+                       ip->i_disksize);
-                memset(kaddr + ip->i_di.di_size, 0,
+                memset(kaddr + ip->i_disksize, 0,
-                       PAGE_CACHE_SIZE - ip->i_di.di_size);
+                       PAGE_CACHE_SIZE - ip->i_disksize);
                kunmap(page);
                SetPageUptodate(page);
@@ -132,7 +132,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
        if (error)
                goto out;
-        if (ip->i_di.di_size) {
+        if (ip->i_disksize) {
                /* Get a free block, fill it with the stuffed data,
                   and write it out to disk */
@@ -159,7 +159,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
        di = (struct gfs2_dinode *)dibh->b_data;
        gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
-        if (ip->i_di.di_size) {
+        if (ip->i_disksize) {
                *(__be64 *)(di + 1) = cpu_to_be64(block);
                gfs2_add_inode_blocks(&ip->i_inode, 1);
                di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
@@ -926,7 +926,7 @@ static int do_grow(struct gfs2_inode *ip, u64 size)
                }
        }
-        ip->i_di.di_size = size;
+        ip->i_disksize = size;
        ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
        gfs2_trans_add_bh(ip->i_gl, dibh, 1);
        gfs2_dinode_out(ip, dibh->b_data);
@@ -1033,7 +1033,7 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
                goto out;
        if (gfs2_is_stuffed(ip)) {
-                ip->i_di.di_size = size;
+                ip->i_disksize = size;
                ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
                gfs2_trans_add_bh(ip->i_gl, dibh, 1);
                gfs2_dinode_out(ip, dibh->b_data);
@@ -1045,9 +1045,9 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
                        error = gfs2_block_truncate_page(ip->i_inode.i_mapping);
                if (!error) {
-                        ip->i_di.di_size = size;
+                        ip->i_disksize = size;
                        ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
-                        ip->i_di.di_flags |= GFS2_DIF_TRUNC_IN_PROG;
+                        ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
                        gfs2_trans_add_bh(ip->i_gl, dibh, 1);
                        gfs2_dinode_out(ip, dibh->b_data);
                }
@@ -1114,13 +1114,13 @@ static int trunc_end(struct gfs2_inode *ip)
        if (error)
                goto out;
-        if (!ip->i_di.di_size) {
+        if (!ip->i_disksize) {
                ip->i_height = 0;
                ip->i_goal = ip->i_no_addr;
                gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
        }
        ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
-        ip->i_di.di_flags &= ~GFS2_DIF_TRUNC_IN_PROG;
+        ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG;
        gfs2_trans_add_bh(ip->i_gl, dibh, 1);
        gfs2_dinode_out(ip, dibh->b_data);
@@ -1205,9 +1205,9 @@ int gfs2_truncatei(struct gfs2_inode *ip, u64 size)
        if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), S_ISREG(ip->i_inode.i_mode)))
                return -EINVAL;
-        if (size > ip->i_di.di_size)
+        if (size > ip->i_disksize)
                error = do_grow(ip, size);
-        else if (size < ip->i_di.di_size)
+        else if (size < ip->i_disksize)
                error = do_shrink(ip, size);
        else
                /* update time stamps */
@@ -1219,7 +1219,7 @@ int gfs2_truncatei(struct gfs2_inode *ip, u64 size)
 int gfs2_truncatei_resume(struct gfs2_inode *ip)
 {
        int error;
-        error = trunc_dealloc(ip, ip->i_di.di_size);
+        error = trunc_dealloc(ip, ip->i_disksize);
        if (!error)
                error = trunc_end(ip);
        return error;
@@ -1231,35 +1231,6 @@ int gfs2_file_dealloc(struct gfs2_inode *ip)
 }
 /**
- * gfs2_write_calc_reserv - calculate number of blocks needed to write to a file
- * @ip: the file
- * @len: the number of bytes to be written to the file
- * @data_blocks: returns the number of data blocks required
- * @ind_blocks: returns the number of indirect blocks required
- *
- */
-void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len,
-                            unsigned int *data_blocks, unsigned int *ind_blocks)
-{
-        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-        unsigned int tmp;
-        if (gfs2_is_dir(ip)) {
-                *data_blocks = DIV_ROUND_UP(len, sdp->sd_jbsize) + 2;
-                *ind_blocks = 3 * (sdp->sd_max_jheight - 1);
-        } else {
-                *data_blocks = (len >> sdp->sd_sb.sb_bsize_shift) + 3;
-                *ind_blocks = 3 * (sdp->sd_max_height - 1);
-        }
-        for (tmp = *data_blocks; tmp > sdp->sd_diptrs;) {
-                tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
-                *ind_blocks += tmp;
-        }
-}
-/**
 * gfs2_write_alloc_required - figure out if a write will require an allocation
 * @ip: the file being written to
 * @offset: the offset to write to
@@ -1276,6 +1247,7 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
        struct buffer_head bh;
        unsigned int shift;
        u64 lblock, lblock_stop, size;
+        u64 end_of_file;
        *alloc_required = 0;
@@ -1291,19 +1263,12 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
        *alloc_required = 1;
        shift = sdp->sd_sb.sb_bsize_shift;
-        if (gfs2_is_dir(ip)) {
+        BUG_ON(gfs2_is_dir(ip));
-                unsigned int bsize = sdp->sd_jbsize;
+        end_of_file = (ip->i_disksize + sdp->sd_sb.sb_bsize - 1) >> shift;
-                lblock = offset;
+        lblock = offset >> shift;
-                do_div(lblock, bsize);
+        lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
-                lblock_stop = offset + len + bsize - 1;
+        if (lblock_stop > end_of_file)
-                do_div(lblock_stop, bsize);
+                return 0;
-        } else {
-                u64 end_of_file = (ip->i_di.di_size + sdp->sd_sb.sb_bsize - 1) >> shift;
-                lblock = offset >> shift;
-                lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
-                if (lblock_stop > end_of_file)
-                        return 0;
-        }
        size = (lblock_stop - lblock) << shift;
        do {
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
index 4e6cde2943bd..c983177e05ac 100644
--- a/fs/gfs2/bmap.h
+++ b/fs/gfs2/bmap.h
@@ -10,10 +10,40 @@
 #ifndef __BMAP_DOT_H__
 #define __BMAP_DOT_H__
+#include "inode.h"
 struct inode;
 struct gfs2_inode;
 struct page;
+/**
+ * gfs2_write_calc_reserv - calculate number of blocks needed to write to a file
+ * @ip: the file
+ * @len: the number of bytes to be written to the file
+ * @data_blocks: returns the number of data blocks required
+ * @ind_blocks: returns the number of indirect blocks required
+ *
+ * Must not be called on a directory inode: gfs2_is_dir(@ip) triggers BUG_ON.
+ * @data_blocks is the number of whole blocks in @len plus three.
+ * @ind_blocks starts at 3 * (sd_max_height - 1) and then accumulates
+ * DIV_ROUND_UP(n, sd_inptrs) per iteration, starting from n = @data_blocks,
+ * until n no longer exceeds sd_diptrs.
+ */
+static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip,
+                                          unsigned int len,
+                                          unsigned int *data_blocks,
+                                          unsigned int *ind_blocks)
+{
+        const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+        unsigned int tmp;
+        BUG_ON(gfs2_is_dir(ip));
+        *data_blocks = (len >> sdp->sd_sb.sb_bsize_shift) + 3;
+        *ind_blocks = 3 * (sdp->sd_max_height - 1);
+        for (tmp = *data_blocks; tmp > sdp->sd_diptrs;) { /* stop once the count is at most sd_diptrs */
+                tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
+                *ind_blocks += tmp;
+        }
+}
 int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page);
 int gfs2_block_map(struct inode *inode, sector_t lblock, struct buffer_head *bh, int create);
 int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen);
@@ -21,10 +51,6 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi
 int gfs2_truncatei(struct gfs2_inode *ip, u64 size);
 int gfs2_truncatei_resume(struct gfs2_inode *ip);
 int gfs2_file_dealloc(struct gfs2_inode *ip);
-void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len,
-                            unsigned int *data_blocks,
-                            unsigned int *ind_blocks);
 int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
                              unsigned int len, int *alloc_required);
diff --git a/fs/gfs2/daemon.c b/fs/gfs2/daemon.c
deleted file mode 100644
index e51991947d2c..000000000000
--- a/fs/gfs2/daemon.c
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/completion.h>
-#include <linux/buffer_head.h>
-#include <linux/kthread.h>
-#include <linux/delay.h>
-#include <linux/gfs2_ondisk.h>
-#include <linux/lm_interface.h>
-#include <linux/freezer.h>
-#include "gfs2.h"
-#include "incore.h"
-#include "daemon.h"
-#include "glock.h"
-#include "log.h"
-#include "quota.h"
-#include "recovery.h"
-#include "super.h"
-#include "util.h"
-/* This uses schedule_timeout() instead of msleep() because it's good for
-   the daemons to wake up more often than the timeout when unmounting so
-   the user's unmount doesn't sit there forever.
-   The kthread functions used to start these daemons block and flush signals. */
-/**
- * gfs2_glockd - Reclaim unused glock structures
- * @sdp: Pointer to GFS2 superblock
- *
- * One or more of these daemons run, reclaiming glocks on sd_reclaim_list.
- * Number of daemons can be set by user, with num_glockd mount option.
- */
-int gfs2_glockd(void *data)
-{
-        struct gfs2_sbd *sdp = data;
-        while (!kthread_should_stop()) {
-                while (atomic_read(&sdp->sd_reclaim_count))
-                        gfs2_reclaim_glock(sdp);
-                wait_event_interruptible(sdp->sd_reclaim_wq,
-                                         (atomic_read(&sdp->sd_reclaim_count) ||
-                                         kthread_should_stop()));
-                if (freezing(current))
-                        refrigerator();
-        }
-        return 0;
-}
-/**
- * gfs2_recoverd - Recover dead machine's journals
- * @sdp: Pointer to GFS2 superblock
- *
- */
-int gfs2_recoverd(void *data)
-{
-        struct gfs2_sbd *sdp = data;
-        unsigned long t;
-        while (!kthread_should_stop()) {
-                gfs2_check_journals(sdp);
-                t = gfs2_tune_get(sdp,  gt_recoverd_secs) * HZ;
-                if (freezing(current))
-                        refrigerator();
-                schedule_timeout_interruptible(t);
-        }
-        return 0;
-}
-/**
- * gfs2_quotad - Write cached quota changes into the quota file
- * @sdp: Pointer to GFS2 superblock
- *
- */
-int gfs2_quotad(void *data)
-{
-        struct gfs2_sbd *sdp = data;
-        unsigned long t;
-        int error;
-        while (!kthread_should_stop()) {
-                /* Update the master statfs file */
-                t = sdp->sd_statfs_sync_time +
-                    gfs2_tune_get(sdp, gt_statfs_quantum) * HZ;
-                if (time_after_eq(jiffies, t)) {
-                        error = gfs2_statfs_sync(sdp);
-                        if (error &&
-                            error != -EROFS &&
-                            !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
-                                fs_err(sdp, "quotad: (1) error=%d\n", error);
-                        sdp->sd_statfs_sync_time = jiffies;
-                }
-                /* Update quota file */
-                t = sdp->sd_quota_sync_time +
-                    gfs2_tune_get(sdp, gt_quota_quantum) * HZ;
-                if (time_after_eq(jiffies, t)) {
-                        error = gfs2_quota_sync(sdp);
-                        if (error &&
-                            error != -EROFS &&
-                            !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
-                                fs_err(sdp, "quotad: (2) error=%d\n", error);
-                        sdp->sd_quota_sync_time = jiffies;
-                }
-                gfs2_quota_scan(sdp);
-                t = gfs2_tune_get(sdp, gt_quotad_secs) * HZ;
-                if (freezing(current))
-                        refrigerator();
-                schedule_timeout_interruptible(t);
-        }
-        return 0;
-}
diff --git a/fs/gfs2/daemon.h b/fs/gfs2/daemon.h
deleted file mode 100644
index 4be084fb6a62..000000000000
--- a/fs/gfs2/daemon.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-#ifndef __DAEMON_DOT_H__
-#define __DAEMON_DOT_H__
-int gfs2_glockd(void *data);
-int gfs2_recoverd(void *data);
-int gfs2_quotad(void *data);
-#endif /* __DAEMON_DOT_H__ */
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index eed040d8ba3a..b7c8e5c70791 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -36,7 +36,7 @@
 * the block.  In leaves, they begin at offset sizeof(struct gfs2_leaf) from the
 * beginning of the leaf block. The dirents reside in leaves when
 *
- * dip->i_di.di_flags & GFS2_DIF_EXHASH is true
+ * dip->i_diskflags & GFS2_DIF_EXHASH is true
 *
 * Otherwise, the dirents are "linear", within a single stuffed dinode block.
 *
@@ -128,8 +128,8 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
        gfs2_trans_add_bh(ip->i_gl, dibh, 1);
        memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size);
-        if (ip->i_di.di_size < offset + size)
+        if (ip->i_disksize < offset + size)
-                ip->i_di.di_size = offset + size;
+                ip->i_disksize = offset + size;
        ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
        gfs2_dinode_out(ip, dibh->b_data);
@@ -226,8 +226,8 @@ out:
        if (error)
                return error;
-        if (ip->i_di.di_size < offset + copied)
+        if (ip->i_disksize < offset + copied)
-                ip->i_di.di_size = offset + copied;
+                ip->i_disksize = offset + copied;
        ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
        gfs2_trans_add_bh(ip->i_gl, dibh, 1);
@@ -277,11 +277,11 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset,
        int copied = 0;
        int error = 0;
-        if (offset >= ip->i_di.di_size)
+        if (offset >= ip->i_disksize)
                return 0;
-        if (offset + size > ip->i_di.di_size)
+        if (offset + size > ip->i_disksize)
-                size = ip->i_di.di_size - offset;
+                size = ip->i_disksize - offset;
        if (!size)
                return 0;
@@ -755,12 +755,12 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode,
        struct gfs2_inode *ip = GFS2_I(inode);
        int error;
-        if (ip->i_di.di_flags & GFS2_DIF_EXHASH) {
+        if (ip->i_diskflags & GFS2_DIF_EXHASH) {
                struct gfs2_leaf *leaf;
                unsigned hsize = 1 << ip->i_depth;
                unsigned index;
                u64 ln;
-                if (hsize * sizeof(u64) != ip->i_di.di_size) {
+                if (hsize * sizeof(u64) != ip->i_disksize) {
                        gfs2_consist_inode(ip);
                        return ERR_PTR(-EIO);
                }
@@ -858,8 +858,8 @@ static int dir_make_exhash(struct inode *inode)
                return -ENOSPC;
        bn = bh->b_blocknr;
-        gfs2_assert(sdp, dip->i_di.di_entries < (1 << 16));
+        gfs2_assert(sdp, dip->i_entries < (1 << 16));
-        leaf->lf_entries = cpu_to_be16(dip->i_di.di_entries);
+        leaf->lf_entries = cpu_to_be16(dip->i_entries);
        /*  Copy dirents  */
@@ -905,9 +905,9 @@ static int dir_make_exhash(struct inode *inode)
        for (x = sdp->sd_hash_ptrs; x--; lp++)
                *lp = cpu_to_be64(bn);
-        dip->i_di.di_size = sdp->sd_sb.sb_bsize / 2;
+        dip->i_disksize = sdp->sd_sb.sb_bsize / 2;
        gfs2_add_inode_blocks(&dip->i_inode, 1);
-        dip->i_di.di_flags |= GFS2_DIF_EXHASH;
+        dip->i_diskflags |= GFS2_DIF_EXHASH;
        for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ;
        dip->i_depth = y;
@@ -1082,7 +1082,7 @@ static int dir_double_exhash(struct gfs2_inode *dip)
        int error = 0;
        hsize = 1 << dip->i_depth;
-        if (hsize * sizeof(u64) != dip->i_di.di_size) {
+        if (hsize * sizeof(u64) != dip->i_disksize) {
                gfs2_consist_inode(dip);
                return -EIO;
        }
@@ -1091,7 +1091,7 @@ static int dir_double_exhash(struct gfs2_inode *dip)
        buf = kcalloc(3, sdp->sd_hash_bsize, GFP_NOFS | __GFP_NOFAIL);
-        for (block = dip->i_di.di_size >> sdp->sd_hash_bsize_shift; block--;) {
+        for (block = dip->i_disksize >> sdp->sd_hash_bsize_shift; block--;) {
                error = gfs2_dir_read_data(dip, (char *)buf,
                                            block * sdp->sd_hash_bsize,
                                            sdp->sd_hash_bsize, 1);
@@ -1370,7 +1370,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
        unsigned depth = 0;
        hsize = 1 << dip->i_depth;
-        if (hsize * sizeof(u64) != dip->i_di.di_size) {
+        if (hsize * sizeof(u64) != dip->i_disksize) {
                gfs2_consist_inode(dip);
                return -EIO;
        }
@@ -1426,10 +1426,10 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
        int copied = 0;
        int error;
-        if (!dip->i_di.di_entries)
+        if (!dip->i_entries)
                return 0;
-        if (dip->i_di.di_flags & GFS2_DIF_EXHASH)
+        if (dip->i_diskflags & GFS2_DIF_EXHASH)
                return dir_e_read(inode, offset, opaque, filldir);
        if (!gfs2_is_stuffed(dip)) {
@@ -1453,17 +1453,17 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
                        error = PTR_ERR(dent);
                        goto out;
                }
-                if (dip->i_di.di_entries != g.offset) {
+                if (dip->i_entries != g.offset) {
                        fs_warn(sdp, "Number of entries corrupt in dir %llu, "
-                                "ip->i_di.di_entries (%u) != g.offset (%u)\n",
+                                "ip->i_entries (%u) != g.offset (%u)\n",
                                (unsigned long long)dip->i_no_addr,
-                                dip->i_di.di_entries,
+                                dip->i_entries,
                                g.offset);
                        error = -EIO;
                        goto out;
                }
                error = do_filldir_main(dip, offset, opaque, filldir, darr,
-                                        dip->i_di.di_entries, &copied);
+                                        dip->i_entries, &copied);
 out:
                kfree(darr);
        }
@@ -1612,7 +1612,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
                        dent = gfs2_init_dirent(inode, dent, name, bh);
                        gfs2_inum_out(nip, dent);
                        dent->de_type = cpu_to_be16(type);
-                        if (ip->i_di.di_flags & GFS2_DIF_EXHASH) {
+                        if (ip->i_diskflags & GFS2_DIF_EXHASH) {
                                leaf = (struct gfs2_leaf *)bh->b_data;
                                be16_add_cpu(&leaf->lf_entries, 1);
                        }
@@ -1621,14 +1621,14 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
                        if (error)
                                break;
                        gfs2_trans_add_bh(ip->i_gl, bh, 1);
-                        ip->i_di.di_entries++;
+                        ip->i_entries++;
                        ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
                        gfs2_dinode_out(ip, bh->b_data);
                        brelse(bh);
                        error = 0;
                        break;
                }
-                if (!(ip->i_di.di_flags & GFS2_DIF_EXHASH)) {
+                if (!(ip->i_diskflags & GFS2_DIF_EXHASH)) {
                        error = dir_make_exhash(inode);
                        if (error)
                                break;
@@ -1691,7 +1691,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name)
        }
        dirent_del(dip, bh, prev, dent);
-        if (dip->i_di.di_flags & GFS2_DIF_EXHASH) {
+        if (dip->i_diskflags & GFS2_DIF_EXHASH) {
                struct gfs2_leaf *leaf = (struct gfs2_leaf *)bh->b_data;
                u16 entries = be16_to_cpu(leaf->lf_entries);
                if (!entries)
@@ -1704,10 +1704,10 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name)
        if (error)
                return error;
-        if (!dip->i_di.di_entries)
+        if (!dip->i_entries)
                gfs2_consist_inode(dip);
        gfs2_trans_add_bh(dip->i_gl, bh, 1);
-        dip->i_di.di_entries--;
+        dip->i_entries--;
        dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME;
        gfs2_dinode_out(dip, bh->b_data);
        brelse(bh);
@@ -1748,7 +1748,7 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
        gfs2_inum_out(nip, dent);
        dent->de_type = cpu_to_be16(new_type);
-        if (dip->i_di.di_flags & GFS2_DIF_EXHASH) {
+        if (dip->i_diskflags & GFS2_DIF_EXHASH) {
                brelse(bh);
                error = gfs2_meta_inode_buffer(dip, &bh);
                if (error)
@@ -1784,7 +1784,7 @@ static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data)
        int error = 0;
        hsize = 1 << dip->i_depth;
-        if (hsize * sizeof(u64) != dip->i_di.di_size) {
+        if (hsize * sizeof(u64) != dip->i_disksize) {
                gfs2_consist_inode(dip);
                return -EIO;
        }
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index 8a468cac9328..4f919440c3be 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -11,6 +11,7 @@
 #define __DIR_DOT_H__
 #include <linux/dcache.h>
+#include <linux/crc32.h>
 struct inode;
 struct gfs2_inode;
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
index e3f76f451b0a..0d1c76d906ae 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/eattr.c
@@ -114,11 +114,11 @@ static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data)
        __be64 *eablk, *end;
        int error;
-        error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT, &bh);
+        error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &bh);
        if (error)
                return error;
-        if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT)) {
+        if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT)) {
                error = ea_foreach_i(ip, bh, ea_call, data);
                goto out;
        }
@@ -414,7 +414,7 @@ int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er)
        if (error)
                return error;
-        if (ip->i_di.di_eattr) {
+        if (ip->i_eattr) {
                struct ea_list ei = { .ei_er = er, .ei_size = 0 };
                error = ea_foreach(ip, ea_list_i, &ei);
@@ -514,7 +514,7 @@ int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
        struct gfs2_ea_location el;
        int error;
-        if (!ip->i_di.di_eattr)
+        if (!ip->i_eattr)
                return -ENODATA;
        error = gfs2_ea_find(ip, er, &el);
@@ -741,7 +741,7 @@ static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
        if (error)
                return error;
-        ip->i_di.di_eattr = bh->b_blocknr;
+        ip->i_eattr = bh->b_blocknr;
        error = ea_write(ip, GFS2_EA_BH2FIRST(bh), er);
        brelse(bh);
@@ -935,10 +935,10 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
        int error;
        int mh_size = sizeof(struct gfs2_meta_header);
-        if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) {
+        if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) {
                __be64 *end;
-                error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT,
+                error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT,
                                       &indbh);
                if (error)
                        return error;
@@ -972,9 +972,9 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
                gfs2_buffer_clear_tail(indbh, mh_size);
                eablk = (__be64 *)(indbh->b_data + mh_size);
-                *eablk = cpu_to_be64(ip->i_di.di_eattr);
+                *eablk = cpu_to_be64(ip->i_eattr);
-                ip->i_di.di_eattr = blk;
+                ip->i_eattr = blk;
-                ip->i_di.di_flags |= GFS2_DIF_EA_INDIRECT;
+                ip->i_diskflags |= GFS2_DIF_EA_INDIRECT;
                gfs2_add_inode_blocks(&ip->i_inode, 1);
                eablk++;
@@ -1015,7 +1015,7 @@ static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
        if (error)
                return error;
-        if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT))
+        if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT))
                blks++;
        if (GFS2_EAREQ_SIZE_STUFFED(er) > GFS2_SB(&ip->i_inode)->sd_jbsize)
                blks += DIV_ROUND_UP(er->er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize);
@@ -1040,7 +1040,7 @@ int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
        struct gfs2_ea_location el;
        int error;
-        if (!ip->i_di.di_eattr) {
+        if (!ip->i_eattr) {
                if (er->er_flags & XATTR_REPLACE)
                        return -ENODATA;
                return ea_init(ip, er);
@@ -1051,7 +1051,7 @@ int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
                return error;
        if (el.el_ea) {
-                if (ip->i_di.di_flags & GFS2_DIF_APPENDONLY) {
+                if (ip->i_diskflags & GFS2_DIF_APPENDONLY) {
                        brelse(el.el_bh);
                        return -EPERM;
                }
@@ -1145,7 +1145,7 @@ int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
        struct gfs2_ea_location el;
        int error;
-        if (!ip->i_di.di_eattr)
+        if (!ip->i_eattr)
                return -ENODATA;
        error = gfs2_ea_find(ip, er, &el);
@@ -1309,7 +1309,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
        memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
-        error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT, &indbh);
+        error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &indbh);
        if (error)
                return error;
@@ -1388,7 +1388,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
        if (bstart)
                gfs2_free_meta(ip, bstart, blen);
-        ip->i_di.di_flags &= ~GFS2_DIF_EA_INDIRECT;
+        ip->i_diskflags &= ~GFS2_DIF_EA_INDIRECT;
        error = gfs2_meta_inode_buffer(ip, &dibh);
        if (!error) {
@@ -1416,7 +1416,7 @@ static int ea_dealloc_block(struct gfs2_inode *ip)
        struct buffer_head *dibh;
        int error;
-        rgd = gfs2_blk2rgrpd(sdp, ip->i_di.di_eattr);
+        rgd = gfs2_blk2rgrpd(sdp, ip->i_eattr);
        if (!rgd) {
                gfs2_consist_inode(ip);
                return -EIO;
@@ -1432,9 +1432,9 @@ static int ea_dealloc_block(struct gfs2_inode *ip)
        if (error)
                goto out_gunlock;
-        gfs2_free_meta(ip, ip->i_di.di_eattr, 1);
+        gfs2_free_meta(ip, ip->i_eattr, 1);
-        ip->i_di.di_eattr = 0;
+        ip->i_eattr = 0;
        gfs2_add_inode_blocks(&ip->i_inode, -1);
        error = gfs2_meta_inode_buffer(ip, &dibh);
@@ -1479,7 +1479,7 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip)
        if (error)
                goto out_rindex;
-        if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) {
+        if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) {
                error = ea_dealloc_indirect(ip);
                if (error)
                        goto out_rindex;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index c962283d4e7f..6b983aef785d 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -40,6 +40,7 @@
 #include "quota.h"
 #include "super.h"
 #include "util.h"
+#include "bmap.h"
 struct gfs2_gl_hash_bucket {
        struct hlist_head hb_list;
@@ -61,9 +62,10 @@ static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int
 static DECLARE_RWSEM(gfs2_umount_flush_sem);
 static struct dentry *gfs2_root;
-static struct task_struct *scand_process;
-static unsigned int scand_secs = 5;
 static struct workqueue_struct *glock_workqueue;
+static LIST_HEAD(lru_list);
+static atomic_t lru_count = ATOMIC_INIT(0);
+static DEFINE_SPINLOCK(lru_lock);
 #define GFS2_GL_HASH_SHIFT      15
 #define GFS2_GL_HASH_SIZE       (1 << GFS2_GL_HASH_SHIFT)
@@ -174,6 +176,22 @@ static void gfs2_glock_hold(struct gfs2_glock *gl)
 }
 /**
+ * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
+ * @gl: the glock
+ *
+ */
+static void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
+{
+        spin_lock(&lru_lock);
+        if (list_empty(&gl->gl_lru) && gl->gl_state != LM_ST_UNLOCKED) {
+                list_add_tail(&gl->gl_lru, &lru_list);
+                atomic_inc(&lru_count);
+        }
+        spin_unlock(&lru_lock);
+}
+/**
 * gfs2_glock_put() - Decrement reference count on glock
 * @gl: The glock to put
 *
@@ -187,14 +205,23 @@ int gfs2_glock_put(struct gfs2_glock *gl)
        if (atomic_dec_and_test(&gl->gl_ref)) {
                hlist_del(&gl->gl_list);
                write_unlock(gl_lock_addr(gl->gl_hash));
+                spin_lock(&lru_lock);
+                if (!list_empty(&gl->gl_lru)) {
+                        list_del_init(&gl->gl_lru);
+                        atomic_dec(&lru_count);
+                }
+                spin_unlock(&lru_lock);
                GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_UNLOCKED);
-                GLOCK_BUG_ON(gl, !list_empty(&gl->gl_reclaim));
+                GLOCK_BUG_ON(gl, !list_empty(&gl->gl_lru));
                GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
                glock_free(gl);
                rv = 1;
                goto out;
        }
        write_unlock(gl_lock_addr(gl->gl_hash));
+        /* 1 for being hashed, 1 for having state != LM_ST_UNLOCKED */
+        if (atomic_read(&gl->gl_ref) == 2)
+                gfs2_glock_schedule_for_reclaim(gl);
 out:
        return rv;
 }
@@ -289,10 +316,13 @@ static void gfs2_holder_wake(struct gfs2_holder *gh)
 * do_promote - promote as many requests as possible on the current queue
 * @gl: The glock
 * 
- * Returns: true if there is a blocked holder at the head of the list
+ * Returns: 1 if there is a blocked holder at the head of the list, or 2
+ *          if a type specific operation is underway.
 */
 static int do_promote(struct gfs2_glock *gl)
+__releases(&gl->gl_spin)
+__acquires(&gl->gl_spin)
 {
        const struct gfs2_glock_operations *glops = gl->gl_ops;
        struct gfs2_holder *gh, *tmp;
@@ -310,6 +340,8 @@ restart:
                                ret = glops->go_lock(gh);
                                spin_lock(&gl->gl_spin);
                                if (ret) {
+                                        if (ret == 1)
+                                                return 2;
                                        gh->gh_error = ret;
                                        list_del_init(&gh->gh_list);
                                        gfs2_holder_wake(gh);
@@ -414,6 +446,7 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret)
        const struct gfs2_glock_operations *glops = gl->gl_ops;
        struct gfs2_holder *gh;
        unsigned state = ret & LM_OUT_ST_MASK;
+        int rv;
        spin_lock(&gl->gl_spin);
        state_change(gl, state);
@@ -468,7 +501,6 @@ retry:
                gfs2_demote_wake(gl);
        if (state != LM_ST_UNLOCKED) {
                if (glops->go_xmote_bh) {
-                        int rv;
                        spin_unlock(&gl->gl_spin);
                        rv = glops->go_xmote_bh(gl, gh);
                        if (rv == -EAGAIN)
@@ -479,10 +511,13 @@ retry:
                                goto out;
                        }
                }
-                do_promote(gl);
+                rv = do_promote(gl);
+                if (rv == 2)
+                        goto out_locked;
        }
 out:
        clear_bit(GLF_LOCK, &gl->gl_flags);
+out_locked:
        spin_unlock(&gl->gl_spin);
        gfs2_glock_put(gl);
 }
@@ -511,6 +546,8 @@ static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
 */
 static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target)
+__releases(&gl->gl_spin)
+__acquires(&gl->gl_spin)
 {
        const struct gfs2_glock_operations *glops = gl->gl_ops;
        struct gfs2_sbd *sdp = gl->gl_sbd;
@@ -576,8 +613,11 @@ static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl)
 */
 static void run_queue(struct gfs2_glock *gl, const int nonblock)
+__releases(&gl->gl_spin)
+__acquires(&gl->gl_spin)
 {
        struct gfs2_holder *gh = NULL;
+        int ret;
        if (test_and_set_bit(GLF_LOCK, &gl->gl_flags))
                return;
@@ -596,8 +636,11 @@ static void run_queue(struct gfs2_glock *gl, const int nonblock)
        } else {
                if (test_bit(GLF_DEMOTE, &gl->gl_flags))
                        gfs2_demote_wake(gl);
-                if (do_promote(gl) == 0)
+                ret = do_promote(gl);
+                if (ret == 0)
                        goto out;
+                if (ret == 2)
+                        return;
                gh = find_first_waiter(gl);
                gl->gl_target = gh->gh_state;
                if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
@@ -820,7 +863,7 @@ static void wait_on_demote(struct gfs2_glock *gl)
 */
 static void handle_callback(struct gfs2_glock *gl, unsigned int state,
-                            int remote, unsigned long delay)
+                            unsigned long delay)
 {
        int bit = delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE;
@@ -828,9 +871,6 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state,
        if (gl->gl_demote_state == LM_ST_EXCLUSIVE) {
                gl->gl_demote_state = state;
                gl->gl_demote_time = jiffies;
-                if (remote && gl->gl_ops->go_type == LM_TYPE_IOPEN &&
-                    gl->gl_object)
-                        gfs2_glock_schedule_for_reclaim(gl);
        } else if (gl->gl_demote_state != LM_ST_UNLOCKED &&
                        gl->gl_demote_state != state) {
                gl->gl_demote_state = LM_ST_UNLOCKED;
@@ -877,6 +917,8 @@ void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
 */
 static inline void add_to_queue(struct gfs2_holder *gh)
+__releases(&gl->gl_spin)
+__acquires(&gl->gl_spin)
 {
        struct gfs2_glock *gl = gh->gh_gl;
        struct gfs2_sbd *sdp = gl->gl_sbd;
@@ -998,7 +1040,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
        spin_lock(&gl->gl_spin);
        if (gh->gh_flags & GL_NOCACHE)
-                handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
+                handle_callback(gl, LM_ST_UNLOCKED, 0);
        list_del_init(&gh->gh_list);
        if (find_first_holder(gl) == NULL) {
@@ -1269,12 +1311,26 @@ static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
                delay = gl->gl_ops->go_min_hold_time;
        spin_lock(&gl->gl_spin);
-        handle_callback(gl, state, 1, delay);
+        handle_callback(gl, state, delay);
        spin_unlock(&gl->gl_spin);
        if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
                gfs2_glock_put(gl);
 }
+static void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid)
+{
+        struct gfs2_jdesc *jd;
+        spin_lock(&sdp->sd_jindex_spin);
+        list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
+                if (jd->jd_jid != jid)
+                        continue;
+                jd->jd_dirty = 1;
+                break;
+        }
+        spin_unlock(&sdp->sd_jindex_spin);
+}
 /**
 * gfs2_glock_cb - Callback used by locking module
 * @sdp: Pointer to the superblock
@@ -1338,80 +1394,83 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data)
 * Returns: 1 if it's ok
 */
-static int demote_ok(struct gfs2_glock *gl)
+static int demote_ok(const struct gfs2_glock *gl)
 {
        const struct gfs2_glock_operations *glops = gl->gl_ops;
-        int demote = 1;
-        if (test_bit(GLF_STICKY, &gl->gl_flags))
-                demote = 0;
-        else if (glops->go_demote_ok)
-                demote = glops->go_demote_ok(gl);
-        return demote;
-}
-/**
- * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
- * @gl: the glock
- *
- */
-void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
-{
-        struct gfs2_sbd *sdp = gl->gl_sbd;
-        spin_lock(&sdp->sd_reclaim_lock);
+        if (gl->gl_state == LM_ST_UNLOCKED)
-        if (list_empty(&gl->gl_reclaim)) {
+                return 0;
-                gfs2_glock_hold(gl);
+        if (!list_empty(&gl->gl_holders))
-                list_add(&gl->gl_reclaim, &sdp->sd_reclaim_list);
+                return 0;
-                atomic_inc(&sdp->sd_reclaim_count);
+        if (glops->go_demote_ok)
-                spin_unlock(&sdp->sd_reclaim_lock);
+                return glops->go_demote_ok(gl);
-                wake_up(&sdp->sd_reclaim_wq);
+        return 1;
-        } else
-                spin_unlock(&sdp->sd_reclaim_lock);
 }
-/**
- * gfs2_reclaim_glock - process the next glock on the filesystem's reclaim list
- * @sdp: the filesystem
- *
- * Called from gfs2_glockd() glock reclaim daemon, or when promoting a
- * different glock and we notice that there are a lot of glocks in the
- * reclaim list.
- *
- */
-void gfs2_reclaim_glock(struct gfs2_sbd *sdp)
+static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask)
 {
        struct gfs2_glock *gl;
-        int done_callback = 0;
+        int may_demote;
+        int nr_skipped = 0;
+        int got_ref = 0;
+        LIST_HEAD(skipped);
-        spin_lock(&sdp->sd_reclaim_lock);
+        if (nr == 0)
-        if (list_empty(&sdp->sd_reclaim_list)) {
+                goto out;
-                spin_unlock(&sdp->sd_reclaim_lock);
-                return;
-        }
-        gl = list_entry(sdp->sd_reclaim_list.next,
-                        struct gfs2_glock, gl_reclaim);
-        list_del_init(&gl->gl_reclaim);
-        spin_unlock(&sdp->sd_reclaim_lock);
-        atomic_dec(&sdp->sd_reclaim_count);
+        if (!(gfp_mask & __GFP_FS))
-        atomic_inc(&sdp->sd_reclaimed);
+                return -1;
-        spin_lock(&gl->gl_spin);
+        spin_lock(&lru_lock);
-        if (find_first_holder(gl) == NULL &&
+        while(nr && !list_empty(&lru_list)) {
-            gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) {
+                gl = list_entry(lru_list.next, struct gfs2_glock, gl_lru);
-                handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
+                list_del_init(&gl->gl_lru);
-                done_callback = 1;
+                atomic_dec(&lru_count);
+                /* Test for being demotable */
+                if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
+                        gfs2_glock_hold(gl);
+                        got_ref = 1;
+                        spin_unlock(&lru_lock);
+                        spin_lock(&gl->gl_spin);
+                        may_demote = demote_ok(gl);
+                        spin_unlock(&gl->gl_spin);
+                        clear_bit(GLF_LOCK, &gl->gl_flags);
+                        if (may_demote) {
+                                handle_callback(gl, LM_ST_UNLOCKED, 0);
+                                nr--;
+                                if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+                                        gfs2_glock_put(gl);
+                        }
+                        spin_lock(&lru_lock);
+                        if (may_demote)
+                                continue;
+                }
+                if (list_empty(&gl->gl_lru) &&
+                    (atomic_read(&gl->gl_ref) <= (2 + got_ref))) {
+                        nr_skipped++;
+                        list_add(&gl->gl_lru, &skipped);
+                }
+                if (got_ref) {
+                        spin_unlock(&lru_lock);
+                        gfs2_glock_put(gl);
+                        spin_lock(&lru_lock);
+                        got_ref = 0;
+                }
        }
-        spin_unlock(&gl->gl_spin);
+        list_splice(&skipped, &lru_list);
-        if (!done_callback ||
+        atomic_add(nr_skipped, &lru_count);
-            queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+        spin_unlock(&lru_lock);
-                gfs2_glock_put(gl);
+out:
+        return (atomic_read(&lru_count) / 100) * sysctl_vfs_cache_pressure;
 }
+static struct shrinker glock_shrinker = {
+        .shrink = gfs2_shrink_glock_memory,
+        .seeks = DEFAULT_SEEKS,
+};
 /**
 * examine_bucket - Call a function for glock in a hash bucket
 * @examiner: the function
@@ -1457,26 +1516,6 @@ out:
 }
 /**
- * scan_glock - look at a glock and see if we can reclaim it
- * @gl: the glock to look at
- *
- */
-static void scan_glock(struct gfs2_glock *gl)
-{
-        if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object)
-                return;
-        if (test_bit(GLF_LOCK, &gl->gl_flags))
-                return;
-        spin_lock(&gl->gl_spin);
-        if (find_first_holder(gl) == NULL &&
-            gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl))
-                gfs2_glock_schedule_for_reclaim(gl);
-        spin_unlock(&gl->gl_spin);
-}
-/**
 * clear_glock - look at a glock and see if we can free it from glock cache
 * @gl: the glock to look at
 *
@@ -1484,23 +1523,16 @@ static void scan_glock(struct gfs2_glock *gl)
 static void clear_glock(struct gfs2_glock *gl)
 {
-        struct gfs2_sbd *sdp = gl->gl_sbd;
+        spin_lock(&lru_lock);
-        int released;
+        if (!list_empty(&gl->gl_lru)) {
+                list_del_init(&gl->gl_lru);
-        spin_lock(&sdp->sd_reclaim_lock);
+                atomic_dec(&lru_count);
-        if (!list_empty(&gl->gl_reclaim)) {
-                list_del_init(&gl->gl_reclaim);
-                atomic_dec(&sdp->sd_reclaim_count);
-                spin_unlock(&sdp->sd_reclaim_lock);
-                released = gfs2_glock_put(gl);
-                gfs2_assert(sdp, !released);
-        } else {
-                spin_unlock(&sdp->sd_reclaim_lock);
        }
+        spin_unlock(&lru_lock);
        spin_lock(&gl->gl_spin);
        if (find_first_holder(gl) == NULL && gl->gl_state != LM_ST_UNLOCKED)
-                handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
+                handle_callback(gl, LM_ST_UNLOCKED, 0);
        spin_unlock(&gl->gl_spin);
        gfs2_glock_hold(gl);
        if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
@@ -1548,6 +1580,20 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
        }
 }
+void gfs2_glock_finish_truncate(struct gfs2_inode *ip)
+{
+        struct gfs2_glock *gl = ip->i_gl;
+        int ret;
+        ret = gfs2_truncatei_resume(ip);
+        gfs2_assert_withdraw(gl->gl_sbd, ret == 0);
+        spin_lock(&gl->gl_spin);
+        clear_bit(GLF_LOCK, &gl->gl_flags);
+        run_queue(gl, 1);
+        spin_unlock(&gl->gl_spin);
+}
 static const char *state2str(unsigned state)
 {
        switch(state) {
@@ -1623,8 +1669,6 @@ static const char *gflags2str(char *buf, const unsigned long *gflags)
        char *p = buf;
        if (test_bit(GLF_LOCK, gflags))
                *p++ = 'l';
-        if (test_bit(GLF_STICKY, gflags))
-                *p++ = 's';
        if (test_bit(GLF_DEMOTE, gflags))
                *p++ = 'D';
        if (test_bit(GLF_PENDING_DEMOTE, gflags))
@@ -1743,34 +1787,6 @@ static int gfs2_dump_lockstate(struct gfs2_sbd *sdp)
        return error;
 }
-/**
- * gfs2_scand - Look for cached glocks and inodes to toss from memory
- * @sdp: Pointer to GFS2 superblock
- *
- * One of these daemons runs, finding candidates to add to sd_reclaim_list.
- * See gfs2_glockd()
- */
-static int gfs2_scand(void *data)
-{
-        unsigned x;
-        unsigned delay;
-        while (!kthread_should_stop()) {
-                for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
-                        examine_bucket(scan_glock, NULL, x);
-                if (freezing(current))
-                        refrigerator();
-                delay = scand_secs;
-                if (delay < 1)
-                        delay = 1;
-                schedule_timeout_interruptible(delay * HZ);
-        }
-        return 0;
-}
 int __init gfs2_glock_init(void)
 {
@@ -1784,28 +1800,21 @@ int __init gfs2_glock_init(void)
        }
 #endif
-        scand_process = kthread_run(gfs2_scand, NULL, "gfs2_scand");
-        if (IS_ERR(scand_process))
-                return PTR_ERR(scand_process);
        glock_workqueue = create_workqueue("glock_workqueue");
-        if (IS_ERR(glock_workqueue)) {
+        if (IS_ERR(glock_workqueue))
-                kthread_stop(scand_process);
                return PTR_ERR(glock_workqueue);
-        }
+        register_shrinker(&glock_shrinker);
        return 0;
 }
 void gfs2_glock_exit(void)
 {
+        unregister_shrinker(&glock_shrinker);
        destroy_workqueue(glock_workqueue);
-        kthread_stop(scand_process);
 }
-module_param(scand_secs, uint, S_IRUGO|S_IWUSR);
-MODULE_PARM_DESC(scand_secs, "The number of seconds between scand runs");
 static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
 {
        struct gfs2_glock *gl;
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 695c6b193611..543ec7ecfbda 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -129,9 +129,9 @@ int gfs2_lvb_hold(struct gfs2_glock *gl);
 void gfs2_lvb_unhold(struct gfs2_glock *gl);
 void gfs2_glock_cb(void *cb_data, unsigned int type, void *data);
-void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl);
 void gfs2_reclaim_glock(struct gfs2_sbd *sdp);
 void gfs2_gl_hash_clear(struct gfs2_sbd *sdp);
+void gfs2_glock_finish_truncate(struct gfs2_inode *ip);
 int __init gfs2_glock_init(void);
 void gfs2_glock_exit(void);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index c6c318c2a0f6..8522d3aa64fc 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -201,19 +201,12 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
 * Returns: 1 if it's ok
 */
-static int inode_go_demote_ok(struct gfs2_glock *gl)
+static int inode_go_demote_ok(const struct gfs2_glock *gl)
 {
        struct gfs2_sbd *sdp = gl->gl_sbd;
-        int demote = 0;
+        if (sdp->sd_jindex == gl->gl_object || sdp->sd_rindex == gl->gl_object)
+                return 0;
-        if (!gl->gl_object && !gl->gl_aspace->i_mapping->nrpages)
+        return 1;
-                demote = 1;
-        else if (!sdp->sd_args.ar_localcaching &&
-                 time_after_eq(jiffies, gl->gl_stamp +
-                               gfs2_tune_get(sdp, gt_demote_secs) * HZ))
-                demote = 1;
-        return demote;
 }
 /**
@@ -227,6 +220,7 @@ static int inode_go_demote_ok(struct gfs2_glock *gl)
 static int inode_go_lock(struct gfs2_holder *gh)
 {
        struct gfs2_glock *gl = gh->gh_gl;
+        struct gfs2_sbd *sdp = gl->gl_sbd;
        struct gfs2_inode *ip = gl->gl_object;
        int error = 0;
@@ -239,10 +233,16 @@ static int inode_go_lock(struct gfs2_holder *gh)
                        return error;
        }
-        if ((ip->i_di.di_flags & GFS2_DIF_TRUNC_IN_PROG) &&
+        if ((ip->i_diskflags & GFS2_DIF_TRUNC_IN_PROG) &&
            (gl->gl_state == LM_ST_EXCLUSIVE) &&
-            (gh->gh_state == LM_ST_EXCLUSIVE))
+            (gh->gh_state == LM_ST_EXCLUSIVE)) {
-                error = gfs2_truncatei_resume(ip);
+                spin_lock(&sdp->sd_trunc_lock);
+                if (list_empty(&ip->i_trunc_list))
+                        list_add(&sdp->sd_trunc_list, &ip->i_trunc_list);
+                spin_unlock(&sdp->sd_trunc_lock);
+                wake_up(&sdp->sd_quota_wait);
+                return 1;
+        }
        return error;
 }
@@ -260,10 +260,13 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
        const struct gfs2_inode *ip = gl->gl_object;
        if (ip == NULL)
                return 0;
-        gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%08lx\n",
+        gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu/%llu\n",
                  (unsigned long long)ip->i_no_formal_ino,
                  (unsigned long long)ip->i_no_addr,
-                  IF2DT(ip->i_inode.i_mode), ip->i_flags);
+                  IF2DT(ip->i_inode.i_mode), ip->i_flags,
+                  (unsigned int)ip->i_diskflags,
+                  (unsigned long long)ip->i_inode.i_size,
+                  (unsigned long long)ip->i_disksize);
        return 0;
 }
@@ -274,7 +277,7 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
 * Returns: 1 if it's ok
 */
-static int rgrp_go_demote_ok(struct gfs2_glock *gl)
+static int rgrp_go_demote_ok(const struct gfs2_glock *gl)
 {
        return !gl->gl_aspace->i_mapping->nrpages;
 }
@@ -318,7 +321,9 @@ static int rgrp_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
        const struct gfs2_rgrpd *rgd = gl->gl_object;
        if (rgd == NULL)
                return 0;
-        gfs2_print_dbg(seq, " R: n:%llu\n", (unsigned long long)rgd->rd_addr);
+        gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u\n",
+                       (unsigned long long)rgd->rd_addr, rgd->rd_flags,
+                       rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes);
        return 0;
 }
@@ -374,13 +379,25 @@ static int trans_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh)
 }
 /**
+ * trans_go_demote_ok
+ * @gl: the glock
+ *
+ * Always returns 0
+ */
+static int trans_go_demote_ok(const struct gfs2_glock *gl)
+{
+        return 0;
+}
+/**
 * quota_go_demote_ok - Check to see if it's ok to unlock a quota glock
 * @gl: the glock
 *
 * Returns: 1 if it's ok
 */
-static int quota_go_demote_ok(struct gfs2_glock *gl)
+static int quota_go_demote_ok(const struct gfs2_glock *gl)
 {
        return !atomic_read(&gl->gl_lvb_count);
 }
@@ -414,6 +431,7 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
 const struct gfs2_glock_operations gfs2_trans_glops = {
        .go_xmote_th = trans_go_sync,
        .go_xmote_bh = trans_go_xmote_bh,
+        .go_demote_ok = trans_go_demote_ok,
        .go_type = LM_TYPE_NONDISK,
 };
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index f566ec1b4e8e..608849d00021 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -68,12 +68,6 @@ struct gfs2_bitmap {
        u32 bi_len;
 };
-struct gfs2_rgrp_host {
-        u32 rg_free;
-        u32 rg_dinodes;
-        u64 rg_igeneration;
-};
 struct gfs2_rgrpd {
        struct list_head rd_list;       /* Link with superblock */
        struct list_head rd_list_mru;
@@ -83,14 +77,16 @@ struct gfs2_rgrpd {
        u32 rd_length;                  /* length of rgrp header in fs blocks */
        u32 rd_data;                    /* num of data blocks in rgrp */
        u32 rd_bitbytes;                /* number of bytes in data bitmaps */
-        struct gfs2_rgrp_host rd_rg;
+        u32 rd_free;
+        u32 rd_free_clone;
+        u32 rd_dinodes;
+        u64 rd_igeneration;
        struct gfs2_bitmap *rd_bits;
-        unsigned int rd_bh_count;
        struct mutex rd_mutex;
-        u32 rd_free_clone;
        struct gfs2_log_element rd_le;
-        u32 rd_last_alloc;
        struct gfs2_sbd *rd_sbd;
+        unsigned int rd_bh_count;
+        u32 rd_last_alloc;
        unsigned char rd_flags;
 #define GFS2_RDF_CHECK        0x01      /* Need to check for unlinked inodes */
 #define GFS2_RDF_NOALLOC      0x02      /* rg prohibits allocation */
@@ -129,7 +125,7 @@ struct gfs2_glock_operations {
        void (*go_xmote_th) (struct gfs2_glock *gl);
        int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh);
        void (*go_inval) (struct gfs2_glock *gl, int flags);
-        int (*go_demote_ok) (struct gfs2_glock *gl);
+        int (*go_demote_ok) (const struct gfs2_glock *gl);
        int (*go_lock) (struct gfs2_holder *gh);
        void (*go_unlock) (struct gfs2_holder *gh);
        int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl);
@@ -159,7 +155,6 @@ struct gfs2_holder {
 enum {
        GLF_LOCK                        = 1,
-        GLF_STICKY                      = 2,
        GLF_DEMOTE                      = 3,
        GLF_PENDING_DEMOTE              = 4,
        GLF_DEMOTE_IN_PROGRESS          = 5,
@@ -194,7 +189,7 @@ struct gfs2_glock {
        unsigned long gl_tchange;
        void *gl_object;
-        struct list_head gl_reclaim;
+        struct list_head gl_lru;
        struct gfs2_sbd *gl_sbd;
@@ -233,29 +228,24 @@ enum {
        GIF_USER                = 4, /* user inode, not metadata addr space */
 };
-struct gfs2_dinode_host {
-        u64 di_size;            /* number of bytes in file */
-        u64 di_generation;      /* generation number for NFS */
-        u32 di_flags;           /* GFS2_DIF_... */
-        /* These only apply to directories  */
-        u32 di_entries;         /* The number of entries in the directory */
-        u64 di_eattr;           /* extended attribute block number */
-};
 struct gfs2_inode {
        struct inode i_inode;
        u64 i_no_addr;
        u64 i_no_formal_ino;
+        u64 i_generation;
+        u64 i_eattr;
+        loff_t i_disksize;
        unsigned long i_flags;          /* GIF_... */
-        struct gfs2_dinode_host i_di; /* To be replaced by ref to block */
        struct gfs2_glock *i_gl; /* Move into i_gh? */
        struct gfs2_holder i_iopen_gh;
        struct gfs2_holder i_gh; /* for prepare/commit_write only */
        struct gfs2_alloc *i_alloc;
        u64 i_goal;     /* goal block for allocations */
        struct rw_semaphore i_rw_mutex;
+        struct list_head i_trunc_list;
+        u32 i_entries;
+        u32 i_diskflags;
        u8 i_height;
        u8 i_depth;
 };
@@ -406,13 +396,11 @@ struct gfs2_args {
 struct gfs2_tune {
        spinlock_t gt_spin;
-        unsigned int gt_demote_secs; /* Cache retention for unheld glock */
        unsigned int gt_incore_log_blocks;
        unsigned int gt_log_flush_secs;
        unsigned int gt_recoverd_secs;
        unsigned int gt_logd_secs;
-        unsigned int gt_quotad_secs;
        unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */
        unsigned int gt_quota_warn_period; /* Secs between quota warn msgs */
@@ -488,10 +476,6 @@ struct gfs2_sbd {
        /* Lock Stuff */
        struct lm_lockstruct sd_lockstruct;
-        struct list_head sd_reclaim_list;
-        spinlock_t sd_reclaim_lock;
-        wait_queue_head_t sd_reclaim_wq;
-        atomic_t sd_reclaim_count;
        struct gfs2_holder sd_live_gh;
        struct gfs2_glock *sd_rename_gl;
        struct gfs2_glock *sd_trans_gl;
@@ -519,7 +503,6 @@ struct gfs2_sbd {
        spinlock_t sd_statfs_spin;
        struct gfs2_statfs_change_host sd_statfs_master;
        struct gfs2_statfs_change_host sd_statfs_local;
-        unsigned long sd_statfs_sync_time;
        /* Resource group stuff */
@@ -552,8 +535,6 @@ struct gfs2_sbd {
        struct task_struct *sd_recoverd_process;
        struct task_struct *sd_logd_process;
        struct task_struct *sd_quotad_process;
-        struct task_struct *sd_glockd_process[GFS2_GLOCKD_MAX];
-        unsigned int sd_glockd_num;
        /* Quota stuff */
@@ -561,13 +542,15 @@ struct gfs2_sbd {
        atomic_t sd_quota_count;
        spinlock_t sd_quota_spin;
        struct mutex sd_quota_mutex;
+        wait_queue_head_t sd_quota_wait;
+        struct list_head sd_trunc_list;
+        spinlock_t sd_trunc_lock;
        unsigned int sd_quota_slots;
        unsigned int sd_quota_chunks;
        unsigned char **sd_quota_bitmap;
        u64 sd_quota_sync_gen;
-        unsigned long sd_quota_sync_time;
        /* Log stuff */
@@ -624,10 +607,6 @@ struct gfs2_sbd {
        struct mutex sd_freeze_lock;
        unsigned int sd_freeze_count;
-        /* Counters */
-        atomic_t sd_reclaimed;
        char sd_fsname[GFS2_FSNAME_LEN];
        char sd_table_name[GFS2_FSNAME_LEN];
        char sd_proto_name[GFS2_FSNAME_LEN];
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index d57616840e89..3b87c188da41 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -32,7 +32,6 @@
 #include "log.h"
 #include "meta_io.h"
 #include "ops_address.h"
-#include "ops_inode.h"
 #include "quota.h"
 #include "rgrp.h"
 #include "trans.h"
@@ -248,7 +247,6 @@ fail:
 static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
 {
-        struct gfs2_dinode_host *di = &ip->i_di;
        const struct gfs2_dinode *str = buf;
        struct timespec atime;
        u16 height, depth;
@@ -274,8 +272,8 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
         * to do that.
         */
        ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink);
-        di->di_size = be64_to_cpu(str->di_size);
+        ip->i_disksize = be64_to_cpu(str->di_size);
-        i_size_write(&ip->i_inode, di->di_size);
+        i_size_write(&ip->i_inode, ip->i_disksize);
        gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks));
        atime.tv_sec = be64_to_cpu(str->di_atime);
        atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
@@ -287,9 +285,9 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
        ip->i_inode.i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec);
        ip->i_goal = be64_to_cpu(str->di_goal_meta);
-        di->di_generation = be64_to_cpu(str->di_generation);
+        ip->i_generation = be64_to_cpu(str->di_generation);
-        di->di_flags = be32_to_cpu(str->di_flags);
+        ip->i_diskflags = be32_to_cpu(str->di_flags);
        gfs2_set_inode_flags(&ip->i_inode);
        height = be16_to_cpu(str->di_height);
        if (unlikely(height > GFS2_MAX_META_HEIGHT))
@@ -300,9 +298,9 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
        if (unlikely(depth > GFS2_DIR_MAX_DEPTH))
                goto corrupt;
        ip->i_depth = (u8)depth;
-        di->di_entries = be32_to_cpu(str->di_entries);
+        ip->i_entries = be32_to_cpu(str->di_entries);
-        di->di_eattr = be64_to_cpu(str->di_eattr);
+        ip->i_eattr = be64_to_cpu(str->di_eattr);
        if (S_ISREG(ip->i_inode.i_mode))
                gfs2_set_aops(&ip->i_inode);
@@ -388,7 +386,6 @@ int gfs2_dinode_dealloc(struct gfs2_inode *ip)
        gfs2_free_di(rgd, ip);
        gfs2_trans_end(sdp);
-        clear_bit(GLF_STICKY, &ip->i_gl->gl_flags);
 out_rg_gunlock:
        gfs2_glock_dq_uninit(&al->al_rgd_gh);
@@ -690,7 +687,7 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
                return error;
        }
-        if (dip->i_di.di_entries == (u32)-1)
+        if (dip->i_entries == (u32)-1)
                return -EFBIG;
        if (S_ISDIR(mode) && dip->i_inode.i_nlink == (u32)-1)
                return -EMLINK;
@@ -790,11 +787,11 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
        di->di_flags = 0;
        if (S_ISREG(mode)) {
-                if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_JDATA) ||
+                if ((dip->i_diskflags & GFS2_DIF_INHERIT_JDATA) ||
                    gfs2_tune_get(sdp, gt_new_files_jdata))
                        di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA);
        } else if (S_ISDIR(mode)) {
-                di->di_flags |= cpu_to_be32(dip->i_di.di_flags &
+                di->di_flags |= cpu_to_be32(dip->i_diskflags &
                                            GFS2_DIF_INHERIT_JDATA);
        }
@@ -1068,7 +1065,7 @@ int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
        struct qstr dotname;
        int error;
-        if (ip->i_di.di_entries != 2) {
+        if (ip->i_entries != 2) {
                if (gfs2_consist_inode(ip))
                        gfs2_dinode_print(ip);
                return -EIO;
@@ -1168,7 +1165,7 @@ int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
                return error;
        }
-        if (!ip->i_di.di_size) {
+        if (!ip->i_disksize) {
                gfs2_consist_inode(ip);
                error = -EIO;
                goto out;
@@ -1178,7 +1175,7 @@ int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
        if (error)
                goto out;
-        x = ip->i_di.di_size + 1;
+        x = ip->i_disksize + 1;
        if (x > *len) {
                *buf = kmalloc(x, GFP_NOFS);
                if (!*buf) {
@@ -1242,7 +1239,6 @@ int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
 void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
 {
-        const struct gfs2_dinode_host *di = &ip->i_di;
        struct gfs2_dinode *str = buf;
        str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
@@ -1256,7 +1252,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
        str->di_uid = cpu_to_be32(ip->i_inode.i_uid);
        str->di_gid = cpu_to_be32(ip->i_inode.i_gid);
        str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink);
-        str->di_size = cpu_to_be64(di->di_size);
+        str->di_size = cpu_to_be64(ip->i_disksize);
        str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
        str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec);
        str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec);
@@ -1264,17 +1260,17 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
        str->di_goal_meta = cpu_to_be64(ip->i_goal);
        str->di_goal_data = cpu_to_be64(ip->i_goal);
-        str->di_generation = cpu_to_be64(di->di_generation);
+        str->di_generation = cpu_to_be64(ip->i_generation);
-        str->di_flags = cpu_to_be32(di->di_flags);
+        str->di_flags = cpu_to_be32(ip->i_diskflags);
        str->di_height = cpu_to_be16(ip->i_height);
        str->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) &&
-                                             !(ip->i_di.di_flags & GFS2_DIF_EXHASH) ?
+                                             !(ip->i_diskflags & GFS2_DIF_EXHASH) ?
                                             GFS2_FORMAT_DE : 0);
        str->di_depth = cpu_to_be16(ip->i_depth);
-        str->di_entries = cpu_to_be32(di->di_entries);
+        str->di_entries = cpu_to_be32(ip->i_entries);
-        str->di_eattr = cpu_to_be64(di->di_eattr);
+        str->di_eattr = cpu_to_be64(ip->i_eattr);
        str->di_atime_nsec = cpu_to_be32(ip->i_inode.i_atime.tv_nsec);
        str->di_mtime_nsec = cpu_to_be32(ip->i_inode.i_mtime.tv_nsec);
        str->di_ctime_nsec = cpu_to_be32(ip->i_inode.i_ctime.tv_nsec);
@@ -1282,22 +1278,21 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
 void gfs2_dinode_print(const struct gfs2_inode *ip)
 {
-        const struct gfs2_dinode_host *di = &ip->i_di;
        printk(KERN_INFO "  no_formal_ino = %llu\n",
               (unsigned long long)ip->i_no_formal_ino);
        printk(KERN_INFO "  no_addr = %llu\n",
               (unsigned long long)ip->i_no_addr);
-        printk(KERN_INFO "  di_size = %llu\n", (unsigned long long)di->di_size);
+        printk(KERN_INFO "  i_disksize = %llu\n",
+               (unsigned long long)ip->i_disksize);
        printk(KERN_INFO "  blocks = %llu\n",
               (unsigned long long)gfs2_get_inode_blocks(&ip->i_inode));
        printk(KERN_INFO "  i_goal = %llu\n",
               (unsigned long long)ip->i_goal);
-        printk(KERN_INFO "  di_flags = 0x%.8X\n", di->di_flags);
+        printk(KERN_INFO "  i_diskflags = 0x%.8X\n", ip->i_diskflags);
        printk(KERN_INFO "  i_height = %u\n", ip->i_height);
        printk(KERN_INFO "  i_depth = %u\n", ip->i_depth);
-        printk(KERN_INFO "  di_entries = %u\n", di->di_entries);
+        printk(KERN_INFO "  i_entries = %u\n", ip->i_entries);
-        printk(KERN_INFO "  di_eattr = %llu\n",
+        printk(KERN_INFO "  i_eattr = %llu\n",
-               (unsigned long long)di->di_eattr);
+               (unsigned long long)ip->i_eattr);
 }
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 2d43f69610a0..d5329364cdff 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -10,6 +10,7 @@
 #ifndef __INODE_DOT_H__
 #define __INODE_DOT_H__
+#include <linux/fs.h>
 #include "util.h"
 static inline int gfs2_is_stuffed(const struct gfs2_inode *ip)
@@ -19,7 +20,7 @@ static inline int gfs2_is_stuffed(const struct gfs2_inode *ip)
 static inline int gfs2_is_jdata(const struct gfs2_inode *ip)
 {
-        return ip->i_di.di_flags & GFS2_DIF_JDATA;
+        return ip->i_diskflags & GFS2_DIF_JDATA;
 }
 static inline int gfs2_is_writeback(const struct gfs2_inode *ip)
@@ -97,5 +98,15 @@ struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
 void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
 void gfs2_dinode_print(const struct gfs2_inode *ip);
+extern const struct inode_operations gfs2_file_iops;
+extern const struct inode_operations gfs2_dir_iops;
+extern const struct inode_operations gfs2_symlink_iops;
+extern const struct file_operations gfs2_file_fops;
+extern const struct file_operations gfs2_dir_fops;
+extern const struct file_operations gfs2_file_fops_nolock;
+extern const struct file_operations gfs2_dir_fops_nolock;
+extern void gfs2_set_inode_flags(struct inode *inode);
 #endif /* __INODE_DOT_H__ */
diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c
index 0c4cbe6c8285..1aa7eb6a0226 100644
--- a/fs/gfs2/locking/dlm/mount.c
+++ b/fs/gfs2/locking/dlm/mount.c
@@ -194,17 +194,25 @@ out:
 static void gdlm_recovery_done(void *lockspace, unsigned int jid,
                               unsigned int message)
 {
+        char env_jid[20];
+        char env_status[20];
+        char *envp[] = { env_jid, env_status, NULL };
        struct gdlm_ls *ls = lockspace;
        ls->recover_jid_done = jid;
        ls->recover_jid_status = message;
-        kobject_uevent(&ls->kobj, KOBJ_CHANGE);
+        sprintf(env_jid, "JID=%d", jid);
+        sprintf(env_status, "RECOVERY=%s",
+                message == LM_RD_SUCCESS ? "Done" : "Failed");
+        kobject_uevent_env(&ls->kobj, KOBJ_CHANGE, envp);
 }
 static void gdlm_others_may_mount(void *lockspace)
 {
+        char *message = "FIRSTMOUNT=Done";
+        char *envp[] = { message, NULL };
        struct gdlm_ls *ls = lockspace;
        ls->first_done = 1;
-        kobject_uevent(&ls->kobj, KOBJ_CHANGE);
+        kobject_uevent_env(&ls->kobj, KOBJ_CHANGE, envp);
 }
 /* Userspace gets the offline uevent, blocks new gfs locks on
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
index 4ec571c3d8a9..9b7edcf7bd49 100644
--- a/fs/gfs2/locking/dlm/sysfs.c
+++ b/fs/gfs2/locking/dlm/sysfs.c
@@ -195,9 +195,23 @@ void gdlm_kobject_release(struct gdlm_ls *ls)
        kobject_put(&ls->kobj);
 }
+static int gdlm_uevent(struct kset *kset, struct kobject *kobj,
+                       struct kobj_uevent_env *env)
+{
+        struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj);
+        add_uevent_var(env, "LOCKTABLE=%s:%s", ls->clustername, ls->fsname);
+        add_uevent_var(env, "LOCKPROTO=lock_dlm");
+        return 0;
+}
+static struct kset_uevent_ops gdlm_uevent_ops = {
+        .uevent = gdlm_uevent,
+};
 int gdlm_sysfs_init(void)
 {
-        gdlm_kset = kset_create_and_add("lock_dlm", NULL, kernel_kobj);
+        gdlm_kset = kset_create_and_add("lock_dlm", &gdlm_uevent_ops, kernel_kobj);
        if (!gdlm_kset) {
                printk(KERN_WARNING "%s: can not create kset\n", __func__);
                return -ENOMEM;
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index bb2cc303ac29..7cacfde32194 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -19,7 +19,7 @@
 #include "gfs2.h"
 #include "incore.h"
-#include "ops_fstype.h"
+#include "super.h"
 #include "sys.h"
 #include "util.h"
 #include "glock.h"
@@ -30,6 +30,7 @@ static void gfs2_init_inode_once(void *foo)
        inode_init_once(&ip->i_inode);
        init_rwsem(&ip->i_rw_mutex);
+        INIT_LIST_HEAD(&ip->i_trunc_list);
        ip->i_alloc = NULL;
 }
@@ -42,7 +43,7 @@ static void gfs2_init_glock_once(void *foo)
        INIT_LIST_HEAD(&gl->gl_holders);
        gl->gl_lvb = NULL;
        atomic_set(&gl->gl_lvb_count, 0);
-        INIT_LIST_HEAD(&gl->gl_reclaim);
+        INIT_LIST_HEAD(&gl->gl_lru);
        INIT_LIST_HEAD(&gl->gl_ail_list);
        atomic_set(&gl->gl_ail_count, 0);
 }
@@ -93,6 +94,12 @@ static int __init init_gfs2_fs(void)
        if (!gfs2_rgrpd_cachep)
                goto fail;
+        gfs2_quotad_cachep = kmem_cache_create("gfs2_quotad",
+                                               sizeof(struct gfs2_quota_data),
+                                               0, 0, NULL);
+        if (!gfs2_quotad_cachep)
+                goto fail;
        error = register_filesystem(&gfs2_fs_type);
        if (error)
                goto fail;
@@ -112,6 +119,9 @@ fail_unregister:
 fail:
        gfs2_glock_exit();
+        if (gfs2_quotad_cachep)
+                kmem_cache_destroy(gfs2_quotad_cachep);
        if (gfs2_rgrpd_cachep)
                kmem_cache_destroy(gfs2_rgrpd_cachep);
@@ -140,6 +150,7 @@ static void __exit exit_gfs2_fs(void)
        unregister_filesystem(&gfs2_fs_type);
        unregister_filesystem(&gfs2meta_fs_type);
+        kmem_cache_destroy(gfs2_quotad_cachep);
        kmem_cache_destroy(gfs2_rgrpd_cachep);
        kmem_cache_destroy(gfs2_bufdata_cachep);
        kmem_cache_destroy(gfs2_inode_cachep);
diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c
index f96eb90a2cfa..3cb0a44ba023 100644
--- a/fs/gfs2/mount.c
+++ b/fs/gfs2/mount.c
@@ -32,7 +32,6 @@ enum {
        Opt_debug,
        Opt_nodebug,
        Opt_upgrade,
-        Opt_num_glockd,
        Opt_acl,
        Opt_noacl,
        Opt_quota_off,
@@ -57,7 +56,6 @@ static const match_table_t tokens = {
        {Opt_debug, "debug"},
        {Opt_nodebug, "nodebug"},
        {Opt_upgrade, "upgrade"},
-        {Opt_num_glockd, "num_glockd=%d"},
        {Opt_acl, "acl"},
        {Opt_noacl, "noacl"},
        {Opt_quota_off, "quota=off"},
@@ -87,16 +85,7 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
        int error = 0;
        if (!remount) {
-                /*  If someone preloaded options, use those instead  */
-                spin_lock(&gfs2_sys_margs_lock);
-                if (gfs2_sys_margs) {
-                        data = gfs2_sys_margs;
-                        gfs2_sys_margs = NULL;
-                }
-                spin_unlock(&gfs2_sys_margs_lock);
                /*  Set some defaults  */
-                args->ar_num_glockd = GFS2_GLOCKD_DEFAULT;
                args->ar_quota = GFS2_QUOTA_DEFAULT;
                args->ar_data = GFS2_DATA_DEFAULT;
        }
@@ -105,7 +94,7 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
           process them */
        for (options = data; (o = strsep(&options, ",")); ) {
-                int token, option;
+                int token;
                substring_t tmp[MAX_OPT_ARGS];
                if (!*o)
@@ -196,22 +185,6 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
                                goto cant_remount;
                        args->ar_upgrade = 1;
                        break;
-                case Opt_num_glockd:
-                        if ((error = match_int(&tmp[0], &option))) {
-                                fs_info(sdp, "problem getting num_glockd\n");
-                                goto out_error;
-                        }
-                        if (remount && option != args->ar_num_glockd)
-                                goto cant_remount;
-                        if (!option || option > GFS2_GLOCKD_MAX) {
-                                fs_info(sdp, "0 < num_glockd <= %u  (not %u)\n",
-                                        GFS2_GLOCKD_MAX, option);
-                                error = -EINVAL;
-                                goto out_error;
-                        }
-                        args->ar_num_glockd = option;
-                        break;
                case Opt_acl:
                        args->ar_posix_acl = 1;
                        sdp->sd_vfs->s_flags |= MS_POSIXACL;
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 27563816e1c5..4ddab67867eb 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -210,25 +210,23 @@ static int gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc
 {
        struct inode *inode = page->mapping->host;
        struct gfs2_sbd *sdp = GFS2_SB(inode);
-        int error;
+        int ret;
        int done_trans = 0;
-        error = gfs2_writepage_common(page, wbc);
-        if (error <= 0)
-                return error;
        if (PageChecked(page)) {
                if (wbc->sync_mode != WB_SYNC_ALL)
                        goto out_ignore;
-                error = gfs2_trans_begin(sdp, RES_DINODE + 1, 0);
+                ret = gfs2_trans_begin(sdp, RES_DINODE + 1, 0);
-                if (error)
+                if (ret)
                        goto out_ignore;
                done_trans = 1;
        }
-        error = __gfs2_jdata_writepage(page, wbc);
+        ret = gfs2_writepage_common(page, wbc);
+        if (ret > 0)
+                ret = __gfs2_jdata_writepage(page, wbc);
        if (done_trans)
                gfs2_trans_end(sdp);
-        return error;
+        return ret;
 out_ignore:
        redirty_page_for_writepage(wbc, page);
@@ -453,8 +451,8 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
        kaddr = kmap_atomic(page, KM_USER0);
        memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode),
-               ip->i_di.di_size);
+               ip->i_disksize);
-        memset(kaddr + ip->i_di.di_size, 0, PAGE_CACHE_SIZE - ip->i_di.di_size);
+        memset(kaddr + ip->i_disksize, 0, PAGE_CACHE_SIZE - ip->i_disksize);
        kunmap_atomic(kaddr, KM_USER0);
        flush_dcache_page(page);
        brelse(dibh);
@@ -627,7 +625,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
 {
        struct gfs2_inode *ip = GFS2_I(mapping->host);
        struct gfs2_sbd *sdp = GFS2_SB(mapping->host);
-        unsigned int data_blocks, ind_blocks, rblocks;
+        unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
        int alloc_required;
        int error = 0;
        struct gfs2_alloc *al;
@@ -641,11 +639,13 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
        if (unlikely(error))
                goto out_uninit;
-        gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks);
        error = gfs2_write_alloc_required(ip, pos, len, &alloc_required);
        if (error)
                goto out_unlock;
+        if (alloc_required || gfs2_is_jdata(ip))
+                gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks);
        if (alloc_required) {
                al = gfs2_alloc_get(ip);
                if (!al) {
@@ -675,7 +675,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
                goto out_trans_fail;
        error = -ENOMEM;
-        page = __grab_cache_page(mapping, index);
+        flags |= AOP_FLAG_NOFS;
+        page = grab_cache_page_write_begin(mapping, index, flags);
        *pagep = page;
        if (unlikely(!page))
                goto out_endtrans;
@@ -782,7 +783,7 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
        if (inode->i_size < to) {
                i_size_write(inode, to);
-                ip->i_di.di_size = inode->i_size;
+                ip->i_disksize = inode->i_size;
                di->di_size = cpu_to_be64(inode->i_size);
                mark_inode_dirty(inode);
        }
@@ -847,9 +848,9 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
        ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
-        if (likely(ret >= 0) && (inode->i_size > ip->i_di.di_size)) {
+        if (likely(ret >= 0) && (inode->i_size > ip->i_disksize)) {
                di = (struct gfs2_dinode *)dibh->b_data;
-                ip->i_di.di_size = inode->i_size;
+                ip->i_disksize = inode->i_size;
                di->di_size = cpu_to_be64(inode->i_size);
                mark_inode_dirty(inode);
        }
diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/ops_dentry.c
index 4a5e676b4420..c2ad36330ca3 100644
--- a/fs/gfs2/ops_dentry.c
+++ b/fs/gfs2/ops_dentry.c
@@ -19,7 +19,7 @@
 #include "incore.h"
 #include "dir.h"
 #include "glock.h"
-#include "ops_dentry.h"
+#include "super.h"
 #include "util.h"
 #include "inode.h"
diff --git a/fs/gfs2/ops_dentry.h b/fs/gfs2/ops_dentry.h
deleted file mode 100644
index 5caa3db4d3f5..000000000000
--- a/fs/gfs2/ops_dentry.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-#ifndef __OPS_DENTRY_DOT_H__
-#define __OPS_DENTRY_DOT_H__
-#include <linux/dcache.h>
-extern struct dentry_operations gfs2_dops;
-#endif /* __OPS_DENTRY_DOT_H__ */
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c
index bbb8c36403a9..7fdeb14ddd1a 100644
--- a/fs/gfs2/ops_export.c
+++ b/fs/gfs2/ops_export.c
@@ -22,8 +22,7 @@
 #include "glock.h"
 #include "glops.h"
 #include "inode.h"
-#include "ops_dentry.h"
+#include "super.h"
-#include "ops_fstype.h"
 #include "rgrp.h"
 #include "util.h"
@@ -214,7 +213,7 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
        }
        error = -EIO;
-        if (GFS2_I(inode)->i_di.di_flags & GFS2_DIF_SYSTEM) {
+        if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM) {
                iput(inode);
                goto fail;
        }
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index 3a747f8e2188..93fe41b67f97 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -39,7 +39,6 @@
 #include "util.h"
 #include "eaops.h"
 #include "ops_address.h"
-#include "ops_inode.h"
 /**
 * gfs2_llseek - seek to a location in a file
@@ -158,8 +157,8 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
        if (error)
                return error;
-        fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_di.di_flags);
+        fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_diskflags);
-        if (!S_ISDIR(inode->i_mode) && ip->i_di.di_flags & GFS2_DIF_JDATA)
+        if (!S_ISDIR(inode->i_mode) && ip->i_diskflags & GFS2_DIF_JDATA)
                fsflags |= FS_JOURNAL_DATA_FL;
        if (put_user(fsflags, ptr))
                error = -EFAULT;
@@ -172,17 +171,16 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
 void gfs2_set_inode_flags(struct inode *inode)
 {
        struct gfs2_inode *ip = GFS2_I(inode);
-        struct gfs2_dinode_host *di = &ip->i_di;
        unsigned int flags = inode->i_flags;
        flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
-        if (di->di_flags & GFS2_DIF_IMMUTABLE)
+        if (ip->i_diskflags & GFS2_DIF_IMMUTABLE)
                flags |= S_IMMUTABLE;
-        if (di->di_flags & GFS2_DIF_APPENDONLY)
+        if (ip->i_diskflags & GFS2_DIF_APPENDONLY)
                flags |= S_APPEND;
-        if (di->di_flags & GFS2_DIF_NOATIME)
+        if (ip->i_diskflags & GFS2_DIF_NOATIME)
                flags |= S_NOATIME;
-        if (di->di_flags & GFS2_DIF_SYNC)
+        if (ip->i_diskflags & GFS2_DIF_SYNC)
                flags |= S_SYNC;
        inode->i_flags = flags;
 }
@@ -221,7 +219,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
        if (error)
                goto out_drop_write;
-        flags = ip->i_di.di_flags;
+        flags = ip->i_diskflags;
        new_flags = (flags & ~mask) | (reqflags & mask);
        if ((new_flags ^ flags) == 0)
                goto out;
@@ -260,7 +258,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
        if (error)
                goto out_trans_end;
        gfs2_trans_add_bh(ip->i_gl, bh, 1);
-        ip->i_di.di_flags = new_flags;
+        ip->i_diskflags = new_flags;
        gfs2_dinode_out(ip, bh->b_data);
        brelse(bh);
        gfs2_set_inode_flags(inode);
@@ -344,7 +342,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
        struct gfs2_inode *ip = GFS2_I(inode);
        struct gfs2_sbd *sdp = GFS2_SB(inode);
        unsigned long last_index;
-        u64 pos = page->index << (PAGE_CACHE_SIZE - inode->i_blkbits);
+        u64 pos = page->index << PAGE_CACHE_SHIFT;
        unsigned int data_blocks, ind_blocks, rblocks;
        int alloc_required = 0;
        struct gfs2_holder gh;
@@ -357,7 +355,6 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
                goto out;
        set_bit(GIF_SW_PAGED, &ip->i_flags);
-        gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
        ret = gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE, &alloc_required);
        if (ret || !alloc_required)
                goto out_unlock;
@@ -369,6 +366,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
        ret = gfs2_quota_lock_check(ip);
        if (ret)
                goto out_alloc_put;
+        gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
        al->al_requested = data_blocks + ind_blocks;
        ret = gfs2_inplace_reserve(ip);
        if (ret)
@@ -479,7 +477,7 @@ static int gfs2_open(struct inode *inode, struct file *file)
                        goto fail;
                if (!(file->f_flags & O_LARGEFILE) &&
-                    ip->i_di.di_size > MAX_NON_LFS) {
+                    ip->i_disksize > MAX_NON_LFS) {
                        error = -EOVERFLOW;
                        goto fail_gunlock;
                }
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index b117fcf2c4f5..f91eebdde581 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -22,20 +22,18 @@
 #include "gfs2.h"
 #include "incore.h"
 #include "bmap.h"
-#include "daemon.h"
 #include "glock.h"
 #include "glops.h"
 #include "inode.h"
 #include "mount.h"
-#include "ops_fstype.h"
-#include "ops_dentry.h"
-#include "ops_super.h"
 #include "recovery.h"
 #include "rgrp.h"
 #include "super.h"
 #include "sys.h"
 #include "util.h"
 #include "log.h"
+#include "quota.h"
+#include "dir.h"
 #define DO 0
 #define UNDO 1
@@ -58,12 +56,10 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
 {
        spin_lock_init(&gt->gt_spin);
-        gt->gt_demote_secs = 300;
        gt->gt_incore_log_blocks = 1024;
        gt->gt_log_flush_secs = 60;
        gt->gt_recoverd_secs = 60;
        gt->gt_logd_secs = 1;
-        gt->gt_quotad_secs = 5;
        gt->gt_quota_simul_sync = 64;
        gt->gt_quota_warn_period = 10;
        gt->gt_quota_scale_num = 1;
@@ -91,10 +87,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
        gfs2_tune_init(&sdp->sd_tune);
-        INIT_LIST_HEAD(&sdp->sd_reclaim_list);
-        spin_lock_init(&sdp->sd_reclaim_lock);
-        init_waitqueue_head(&sdp->sd_reclaim_wq);
        mutex_init(&sdp->sd_inum_mutex);
        spin_lock_init(&sdp->sd_statfs_spin);
@@ -110,6 +102,9 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
        INIT_LIST_HEAD(&sdp->sd_quota_list);
        spin_lock_init(&sdp->sd_quota_spin);
        mutex_init(&sdp->sd_quota_mutex);
+        init_waitqueue_head(&sdp->sd_quota_wait);
+        INIT_LIST_HEAD(&sdp->sd_trunc_list);
+        spin_lock_init(&sdp->sd_trunc_lock);
        spin_lock_init(&sdp->sd_log_lock);
@@ -443,24 +438,11 @@ out:
 static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh,
                        int undo)
 {
-        struct task_struct *p;
        int error = 0;
        if (undo)
                goto fail_trans;
-        for (sdp->sd_glockd_num = 0;
-             sdp->sd_glockd_num < sdp->sd_args.ar_num_glockd;
-             sdp->sd_glockd_num++) {
-                p = kthread_run(gfs2_glockd, sdp, "gfs2_glockd");
-                error = IS_ERR(p);
-                if (error) {
-                        fs_err(sdp, "can't start glockd thread: %d\n", error);
-                        goto fail;
-                }
-                sdp->sd_glockd_process[sdp->sd_glockd_num] = p;
-        }
        error = gfs2_glock_nq_num(sdp,
                                  GFS2_MOUNT_LOCK, &gfs2_nondisk_glops,
                                  LM_ST_EXCLUSIVE, LM_FLAG_NOEXP | GL_NOCACHE,
@@ -493,7 +475,6 @@ static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh,
                fs_err(sdp, "can't create transaction glock: %d\n", error);
                goto fail_rename;
        }
-        set_bit(GLF_STICKY, &sdp->sd_trans_gl->gl_flags);
        return 0;
@@ -506,9 +487,6 @@ fail_live:
 fail_mount:
        gfs2_glock_dq_uninit(mount_gh);
 fail:
-        while (sdp->sd_glockd_num--)
-                kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]);
        return error;
 }
@@ -620,7 +598,7 @@ static int map_journal_extents(struct gfs2_sbd *sdp)
        prev_db = 0;
-        for (lb = 0; lb < ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift; lb++) {
+        for (lb = 0; lb < ip->i_disksize >> sdp->sd_sb.sb_bsize_shift; lb++) {
                bh.b_state = 0;
                bh.b_blocknr = 0;
                bh.b_size = 1 << ip->i_inode.i_blkbits;
@@ -661,6 +639,72 @@ static void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp)
                                        sdp->sd_lockstruct.ls_lockspace);
 }
+/**
+ * gfs2_jindex_hold - Grab a lock on the jindex
+ * @sdp: The GFS2 superblock
+ * @ji_gh: the holder for the jindex glock
+ *
+ * With sd_jindex_mutex held, repeatedly acquires the jindex glock in shared
+ * mode and checks the jindex directory for an entry named "journal<N>",
+ * where N is the current value of sd_journals.  For every entry that is
+ * found the glock is dropped again, a gfs2_jdesc is allocated, its inode is
+ * looked up, and the descriptor is appended to sd_jindex_list while
+ * sd_journals is incremented.
+ *
+ * The loop ends successfully when gfs2_dir_check() reports -ENOENT; in that
+ * case 0 is returned and @ji_gh is still held.  On every error return @ji_gh
+ * is not held: it was either never acquired or has already been released.
+ *
+ * Returns: errno
+ */
+static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
+{
+        struct gfs2_inode *dip = GFS2_I(sdp->sd_jindex);
+        struct qstr name;
+        /* Room for "journal", a decimal unsigned int and the terminator */
+        char buf[20];
+        struct gfs2_jdesc *jd;
+        int error;
+        name.name = buf;
+        mutex_lock(&sdp->sd_jindex_mutex);
+        for (;;) {
+                error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, ji_gh);
+                if (error)
+                        break;
+                /* Name of the next journal we do not yet have a jdesc for */
+                name.len = sprintf(buf, "journal%u", sdp->sd_journals);
+                name.hash = gfs2_disk_hash(name.name, name.len);
+                error = gfs2_dir_check(sdp->sd_jindex, &name, NULL);
+                if (error == -ENOENT) {
+                        /* No more journals: succeed, leaving ji_gh held */
+                        error = 0;
+                        break;
+                }
+                gfs2_glock_dq_uninit(ji_gh);
+                if (error)
+                        break;
+                /* Preset so that the allocation failure below can just break */
+                error = -ENOMEM;
+                jd = kzalloc(sizeof(struct gfs2_jdesc), GFP_KERNEL);
+                if (!jd)
+                        break;
+                INIT_LIST_HEAD(&jd->extent_list);
+                jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1);
+                /* Both a NULL result and an ERR_PTR are treated as failure */
+                if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
+                        if (!jd->jd_inode)
+                                error = -ENOENT;
+                        else
+                                error = PTR_ERR(jd->jd_inode);
+                        kfree(jd);
+                        break;
+                }
+                /* Journal id assignment and list insertion share one lock */
+                spin_lock(&sdp->sd_jindex_spin);
+                jd->jd_jid = sdp->sd_journals++;
+                list_add_tail(&jd->jd_list, &sdp->sd_jindex_list);
+                spin_unlock(&sdp->sd_jindex_spin);
+        }
+        mutex_unlock(&sdp->sd_jindex_mutex);
+        return error;
+}
 static int init_journal(struct gfs2_sbd *sdp, int undo)
 {
        struct inode *master = sdp->sd_master_dir->d_inode;
@@ -681,7 +725,6 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
                return PTR_ERR(sdp->sd_jindex);
        }
        ip = GFS2_I(sdp->sd_jindex);
-        set_bit(GLF_STICKY, &ip->i_gl->gl_flags);
        /* Load in the journal index special file */
@@ -832,7 +875,6 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
                goto fail_statfs;
        }
        ip = GFS2_I(sdp->sd_rindex);
-        set_bit(GLF_STICKY, &ip->i_gl->gl_flags);
        sdp->sd_rindex_uptodate = 0;
        /* Read in the quota inode */
@@ -973,9 +1015,6 @@ static int init_threads(struct gfs2_sbd *sdp, int undo)
        }
        sdp->sd_logd_process = p;
-        sdp->sd_statfs_sync_time = jiffies;
-        sdp->sd_quota_sync_time = jiffies;
        p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad");
        error = IS_ERR(p);
        if (error) {
@@ -1224,17 +1263,21 @@ static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
 static void gfs2_kill_sb(struct super_block *sb)
 {
        struct gfs2_sbd *sdp = sb->s_fs_info;
-        if (sdp) {
-                gfs2_meta_syncfs(sdp);
+        if (sdp == NULL) {
-                dput(sdp->sd_root_dir);
+                kill_block_super(sb);
-                dput(sdp->sd_master_dir);
+                return;
-                sdp->sd_root_dir = NULL;
-                sdp->sd_master_dir = NULL;
        }
+        gfs2_meta_syncfs(sdp);
+        dput(sdp->sd_root_dir);
+        dput(sdp->sd_master_dir);
+        sdp->sd_root_dir = NULL;
+        sdp->sd_master_dir = NULL;
        shrink_dcache_sb(sb);
        kill_block_super(sb);
-        if (sdp)
+        gfs2_delete_debugfs_file(sdp);
-                gfs2_delete_debugfs_file(sdp);
+        kfree(sdp);
 }
 struct file_system_type gfs2_fs_type = {
diff --git a/fs/gfs2/ops_fstype.h b/fs/gfs2/ops_fstype.h
deleted file mode 100644
index da8490511836..000000000000
--- a/fs/gfs2/ops_fstype.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-#ifndef __OPS_FSTYPE_DOT_H__
-#define __OPS_FSTYPE_DOT_H__
-#include <linux/fs.h>
-extern struct file_system_type gfs2_fs_type;
-extern struct file_system_type gfs2meta_fs_type;
-extern const struct export_operations gfs2_export_ops;
-#endif /* __OPS_FSTYPE_DOT_H__ */
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index d232991b9046..49877546beb9 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -19,6 +19,7 @@
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
 #include <linux/lm_interface.h>
+#include <linux/fiemap.h>
 #include <asm/uaccess.h>
 #include "gfs2.h"
@@ -31,12 +32,11 @@
 #include "glock.h"
 #include "inode.h"
 #include "meta_io.h"
-#include "ops_dentry.h"
-#include "ops_inode.h"
 #include "quota.h"
 #include "rgrp.h"
 #include "trans.h"
 #include "util.h"
+#include "super.h"
 /**
 * gfs2_create - Create a file
@@ -185,7 +185,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
        if (!dip->i_inode.i_nlink)
                goto out_gunlock;
        error = -EFBIG;
-        if (dip->i_di.di_entries == (u32)-1)
+        if (dip->i_entries == (u32)-1)
                goto out_gunlock;
        error = -EPERM;
        if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
@@ -371,7 +371,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
        ip = ghs[1].gh_gl->gl_object;
-        ip->i_di.di_size = size;
+        ip->i_disksize = size;
        error = gfs2_meta_inode_buffer(ip, &dibh);
@@ -425,9 +425,9 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        ip = ghs[1].gh_gl->gl_object;
        ip->i_inode.i_nlink = 2;
-        ip->i_di.di_size = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode);
+        ip->i_disksize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode);
-        ip->i_di.di_flags |= GFS2_DIF_JDATA;
+        ip->i_diskflags |= GFS2_DIF_JDATA;
-        ip->i_di.di_entries = 2;
+        ip->i_entries = 2;
        error = gfs2_meta_inode_buffer(ip, &dibh);
@@ -517,13 +517,13 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
        if (error)
                goto out_gunlock;
-        if (ip->i_di.di_entries < 2) {
+        if (ip->i_entries < 2) {
                if (gfs2_consist_inode(ip))
                        gfs2_dinode_print(ip);
                error = -EIO;
                goto out_gunlock;
        }
-        if (ip->i_di.di_entries > 2) {
+        if (ip->i_entries > 2) {
                error = -ENOTEMPTY;
                goto out_gunlock;
        }
@@ -726,13 +726,13 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
                        goto out_gunlock;
                if (S_ISDIR(nip->i_inode.i_mode)) {
-                        if (nip->i_di.di_entries < 2) {
+                        if (nip->i_entries < 2) {
                                if (gfs2_consist_inode(nip))
                                        gfs2_dinode_print(nip);
                                error = -EIO;
                                goto out_gunlock;
                        }
-                        if (nip->i_di.di_entries > 2) {
+                        if (nip->i_entries > 2) {
                                error = -ENOTEMPTY;
                                goto out_gunlock;
                        }
@@ -758,7 +758,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
                                error = -EINVAL;
                                goto out_gunlock;
                        }
-                        if (ndip->i_di.di_entries == (u32)-1) {
+                        if (ndip->i_entries == (u32)-1) {
                                error = -EFBIG;
                                goto out_gunlock;
                        }
@@ -990,7 +990,7 @@ static int setattr_size(struct inode *inode, struct iattr *attr)
        struct gfs2_sbd *sdp = GFS2_SB(inode);
        int error;
-        if (attr->ia_size != ip->i_di.di_size) {
+        if (attr->ia_size != ip->i_disksize) {
                error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
                if (error)
                        return error;
@@ -1001,8 +1001,8 @@ static int setattr_size(struct inode *inode, struct iattr *attr)
        }
        error = gfs2_truncatei(ip, attr->ia_size);
-        if (error && (inode->i_size != ip->i_di.di_size))
+        if (error && (inode->i_size != ip->i_disksize))
-                i_size_write(inode, ip->i_di.di_size);
+                i_size_write(inode, ip->i_disksize);
        return error;
 }
@@ -1212,6 +1212,48 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name)
        return gfs2_ea_remove(GFS2_I(dentry->d_inode), &er);
 }
+/**
+ * gfs2_fiemap - Report the extent mapping of an inode
+ * @inode: the inode being mapped
+ * @fieinfo: fiemap request state that receives the extents
+ * @start: byte offset at which the requested range begins
+ * @len: length in bytes of the requested range
+ *
+ * Holds i_mutex and a shared glock on the inode while the mapping is
+ * produced.  A stuffed inode is reported as at most one inline extent that
+ * lies directly after the on-disk dinode header; every other inode is
+ * handed to __generic_block_fiemap() with gfs2_block_map() as the mapper.
+ *
+ * Returns: 0 on success, otherwise the error from the failing step
+ */
+static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+                       u64 start, u64 len)
+{
+        struct gfs2_inode *ip = GFS2_I(inode);
+        struct gfs2_holder gh;
+        int ret;
+        /* NOTE(review): presumably FIEMAP_FLAG_SYNC is the only flag accepted */
+        ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
+        if (ret)
+                return ret;
+        mutex_lock(&inode->i_mutex);
+        ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
+        if (ret)
+                goto out;
+        if (gfs2_is_stuffed(ip)) {
+                /* Byte address of the dinode block on disk */
+                u64 phys = ip->i_no_addr << inode->i_blkbits;
+                u64 size = i_size_read(inode);
+                u32 flags = FIEMAP_EXTENT_LAST|FIEMAP_EXTENT_NOT_ALIGNED|
+                            FIEMAP_EXTENT_DATA_INLINE;
+                /* The data follows the dinode header inside the same block */
+                phys += sizeof(struct gfs2_dinode);
+                phys += start;
+                /* Clamp to i_size; the result is only used if start < size */
+                if (start + len > size)
+                        len = size - start;
+                if (start < size)
+                        ret = fiemap_fill_next_extent(fieinfo, start, phys,
+                                                      len, flags);
+                /* A return of 1 from the fill helper is not an error here */
+                if (ret == 1)
+                        ret = 0;
+        } else {
+                ret = __generic_block_fiemap(inode, fieinfo, start, len,
+                                             gfs2_block_map);
+        }
+        gfs2_glock_dq_uninit(&gh);
+out:
+        mutex_unlock(&inode->i_mutex);
+        return ret;
+}
 const struct inode_operations gfs2_file_iops = {
        .permission = gfs2_permission,
        .setattr = gfs2_setattr,
@@ -1220,6 +1262,7 @@ const struct inode_operations gfs2_file_iops = {
        .getxattr = gfs2_getxattr,
        .listxattr = gfs2_listxattr,
        .removexattr = gfs2_removexattr,
+        .fiemap = gfs2_fiemap,
 };
 const struct inode_operations gfs2_dir_iops = {
@@ -1239,6 +1282,7 @@ const struct inode_operations gfs2_dir_iops = {
        .getxattr = gfs2_getxattr,
        .listxattr = gfs2_listxattr,
        .removexattr = gfs2_removexattr,
+        .fiemap = gfs2_fiemap,
 };
 const struct inode_operations gfs2_symlink_iops = {
@@ -1251,5 +1295,6 @@ const struct inode_operations gfs2_symlink_iops = {
        .getxattr = gfs2_getxattr,
        .listxattr = gfs2_listxattr,
        .removexattr = gfs2_removexattr,
+        .fiemap = gfs2_fiemap,
 };
diff --git a/fs/gfs2/ops_inode.h b/fs/gfs2/ops_inode.h
deleted file mode 100644
index 14b4b797622a..000000000000
--- a/fs/gfs2/ops_inode.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-#ifndef __OPS_INODE_DOT_H__
-#define __OPS_INODE_DOT_H__
-#include <linux/fs.h>
-extern const struct inode_operations gfs2_file_iops;
-extern const struct inode_operations gfs2_dir_iops;
-extern const struct inode_operations gfs2_symlink_iops;
-extern const struct file_operations gfs2_file_fops;
-extern const struct file_operations gfs2_dir_fops;
-extern const struct file_operations gfs2_file_fops_nolock;
-extern const struct file_operations gfs2_dir_fops_nolock;
-extern void gfs2_set_inode_flags(struct inode *inode);
-#endif /* __OPS_INODE_DOT_H__ */
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index d5355d9b5926..320323d03479 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -28,7 +28,6 @@
 #include "inode.h"
 #include "log.h"
 #include "mount.h"
-#include "ops_super.h"
 #include "quota.h"
 #include "recovery.h"
 #include "rgrp.h"
@@ -143,8 +142,6 @@ static void gfs2_put_super(struct super_block *sb)
        kthread_stop(sdp->sd_quotad_process);
        kthread_stop(sdp->sd_logd_process);
        kthread_stop(sdp->sd_recoverd_process);
-        while (sdp->sd_glockd_num--)
-                kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]);
        if (!(sb->s_flags & MS_RDONLY)) {
                error = gfs2_make_fs_ro(sdp);
@@ -185,7 +182,6 @@ static void gfs2_put_super(struct super_block *sb)
        /*  At this point, we're through participating in the lockspace  */
        gfs2_sys_fs_del(sdp);
-        kfree(sdp);
 }
 /**
@@ -215,18 +211,18 @@ static int gfs2_sync_fs(struct super_block *sb, int wait)
 }
 /**
- * gfs2_write_super_lockfs - prevent further writes to the filesystem
+ * gfs2_freeze - prevent further writes to the filesystem
 * @sb: the VFS structure for the filesystem
 *
 */
-static void gfs2_write_super_lockfs(struct super_block *sb)
+static int gfs2_freeze(struct super_block *sb)
 {
        struct gfs2_sbd *sdp = sb->s_fs_info;
        int error;
        if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
-                return;
+                return -EINVAL;
        for (;;) {
                error = gfs2_freeze_fs(sdp);
@@ -246,17 +242,150 @@ static void gfs2_write_super_lockfs(struct super_block *sb)
                fs_err(sdp, "retrying...\n");
                msleep(1000);
        }
+        return 0;
 }
 /**
- * gfs2_unlockfs - reallow writes to the filesystem
+ * gfs2_unfreeze - reallow writes to the filesystem
 * @sb: the VFS structure for the filesystem
 *
 */
-static void gfs2_unlockfs(struct super_block *sb)
+static int gfs2_unfreeze(struct super_block *sb)
 {
        gfs2_unfreeze_fs(sb->s_fs_info);
+        return 0;
+}
+/**
+ * statfs_slow_fill - add the counters of one RG to the statfs totals
+ * @rgd: the RG
+ * @sc: the sc structure that accumulates total, free and dinode counts
+ *
+ * Returns: 0 (there is no failure path in this function)
+ */
+static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
+                            struct gfs2_statfs_change_host *sc)
+{
+        gfs2_rgrp_verify(rgd);
+        sc->sc_total += rgd->rd_data;
+        sc->sc_free += rgd->rd_free;
+        sc->sc_dinodes += rgd->rd_dinodes;
+        return 0;
+}
+/**
+ * gfs2_statfs_slow - Stat a filesystem using asynchronous locking
+ * @sdp: the filesystem
+ * @sc: the sc info that will be returned
+ *
+ * Zeroes @sc, then walks the resource groups starting from
+ * gfs2_rgrpd_get_first(), keeping up to 64 GL_ASYNC shared glock requests
+ * in flight and adding each resource group's counters to @sc once its
+ * request has been waited for.
+ *
+ * Any error (other than a signal) will cause this routine to fall back
+ * to the synchronous version.
+ * NOTE(review): no fallback is visible in this function, which only
+ * records the error and returns it - confirm against the caller.
+ *
+ * FIXME: This really shouldn't busy wait like this.
+ *
+ * Returns: errno
+ */
+static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
+{
+        struct gfs2_holder ri_gh;
+        struct gfs2_rgrpd *rgd_next;
+        struct gfs2_holder *gha, *gh;
+        /* Number of asynchronous glock requests kept outstanding at once */
+        unsigned int slots = 64;
+        unsigned int x;
+        int done;
+        int error = 0, err;
+        memset(sc, 0, sizeof(struct gfs2_statfs_change_host));
+        /* Zeroed array, so an unused slot has gh_gl == NULL */
+        gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL);
+        if (!gha)
+                return -ENOMEM;
+        error = gfs2_rindex_hold(sdp, &ri_gh);
+        if (error)
+                goto out;
+        rgd_next = gfs2_rgrpd_get_first(sdp);
+        for (;;) {
+                /* Cleared below if any slot is busy or was just refilled */
+                done = 1;
+                for (x = 0; x < slots; x++) {
+                        gh = gha + x;
+                        /* NOTE(review): poll presumably reports completion */
+                        if (gh->gh_gl && gfs2_glock_poll(gh)) {
+                                err = gfs2_glock_wait(gh);
+                                if (err) {
+                                        gfs2_holder_uninit(gh);
+                                        error = err;
+                                } else {
+                                        /* Stop accumulating after an error */
+                                        if (!error)
+                                                error = statfs_slow_fill(
+                                                        gh->gh_gl->gl_object, sc);
+                                        gfs2_glock_dq_uninit(gh);
+                                }
+                        }
+                        /*
+                         * NOTE(review): relies on the uninit calls above
+                         * resetting gh_gl so that the slot reads as free.
+                         */
+                        if (gh->gh_gl)
+                                done = 0;
+                        else if (rgd_next && !error) {
+                                /* Free slot: queue the next resource group */
+                                error = gfs2_glock_nq_init(rgd_next->rd_gl,
+                                                           LM_ST_SHARED,
+                                                           GL_ASYNC,
+                                                           gh);
+                                rgd_next = gfs2_rgrpd_get_next(rgd_next);
+                                done = 0;
+                        }
+                        /* New requests stop, outstanding ones still drain */
+                        if (signal_pending(current))
+                                error = -ERESTARTSYS;
+                }
+                if (done)
+                        break;
+                yield();
+        }
+        gfs2_glock_dq_uninit(&ri_gh);
+out:
+        kfree(gha);
+        return error;
+}
+/**
+ * gfs2_statfs_i - Do a statfs
+ * @sdp: the filesystem
+ * @sc: the sc structure that receives the combined counters
+ *
+ * Copies sd_statfs_master into @sc and adds the sd_statfs_local values to
+ * it under sd_statfs_spin, then clamps the result so that the free count
+ * lies between 0 and the total and the dinode count is not negative.
+ *
+ * Returns: 0 (there is no failure path in this function)
+ */
+static int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
+{
+        struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
+        struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
+        spin_lock(&sdp->sd_statfs_spin);
+        *sc = *m_sc;
+        sc->sc_total += l_sc->sc_total;
+        sc->sc_free += l_sc->sc_free;
+        sc->sc_dinodes += l_sc->sc_dinodes;
+        spin_unlock(&sdp->sd_statfs_spin);
+        /* The clamping works on the caller's copy, outside the lock */
+        if (sc->sc_free < 0)
+                sc->sc_free = 0;
+        if (sc->sc_free > sc->sc_total)
+                sc->sc_free = sc->sc_total;
+        if (sc->sc_dinodes < 0)
+                sc->sc_dinodes = 0;
+        return 0;
 }
 /**
@@ -370,7 +499,6 @@ static void gfs2_clear_inode(struct inode *inode)
         */
        if (test_bit(GIF_USER, &ip->i_flags)) {
                ip->i_gl->gl_object = NULL;
-                gfs2_glock_schedule_for_reclaim(ip->i_gl);
                gfs2_glock_put(ip->i_gl);
                ip->i_gl = NULL;
                if (ip->i_iopen_gh.gh_gl) {
@@ -423,8 +551,6 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
                seq_printf(s, ",debug");
        if (args->ar_upgrade)
                seq_printf(s, ",upgrade");
-        if (args->ar_num_glockd != GFS2_GLOCKD_DEFAULT)
-                seq_printf(s, ",num_glockd=%u", args->ar_num_glockd);
        if (args->ar_posix_acl)
                seq_printf(s, ",acl");
        if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
@@ -494,16 +620,16 @@ static void gfs2_delete_inode(struct inode *inode)
        gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
        error = gfs2_glock_nq(&ip->i_iopen_gh);
        if (error)
-                goto out_uninit;
+                goto out_truncate;
        if (S_ISDIR(inode->i_mode) &&
-            (ip->i_di.di_flags & GFS2_DIF_EXHASH)) {
+            (ip->i_diskflags & GFS2_DIF_EXHASH)) {
                error = gfs2_dir_exhash_dealloc(ip);
                if (error)
                        goto out_unlock;
        }
-        if (ip->i_di.di_eattr) {
+        if (ip->i_eattr) {
                error = gfs2_ea_dealloc(ip);
                if (error)
                        goto out_unlock;
@@ -519,6 +645,7 @@ static void gfs2_delete_inode(struct inode *inode)
        if (error)
                goto out_unlock;
+out_truncate:
        error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
        if (error)
                goto out_unlock;
@@ -527,8 +654,8 @@ static void gfs2_delete_inode(struct inode *inode)
        gfs2_trans_end(sdp);
 out_unlock:
-        gfs2_glock_dq(&ip->i_iopen_gh);
+        if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags))
-out_uninit:
+                gfs2_glock_dq(&ip->i_iopen_gh);
        gfs2_holder_uninit(&ip->i_iopen_gh);
        gfs2_glock_dq_uninit(&gh);
        if (error && error != GLR_TRYFAILED)
@@ -563,8 +690,8 @@ const struct super_operations gfs2_super_ops = {
        .put_super              = gfs2_put_super,
        .write_super            = gfs2_write_super,
        .sync_fs                = gfs2_sync_fs,
-        .write_super_lockfs     = gfs2_write_super_lockfs,
+        .freeze_fs              = gfs2_freeze,
-        .unlockfs               = gfs2_unlockfs,
+        .unfreeze_fs            = gfs2_unfreeze,
        .statfs                 = gfs2_statfs,
        .remount_fs             = gfs2_remount_fs,
        .clear_inode            = gfs2_clear_inode,
diff --git a/fs/gfs2/ops_super.h b/fs/gfs2/ops_super.h
deleted file mode 100644
index 442a274c6272..000000000000
--- a/fs/gfs2/ops_super.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-#ifndef __OPS_SUPER_DOT_H__
-#define __OPS_SUPER_DOT_H__
-#include <linux/fs.h>
-extern const struct super_operations gfs2_super_ops;
-#endif /* __OPS_SUPER_DOT_H__ */
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 3e073f5144fa..b08d09696b3e 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -46,6 +46,8 @@
 #include <linux/bio.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/lm_interface.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
 #include "gfs2.h"
 #include "incore.h"
@@ -94,7 +96,7 @@ static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id,
        struct gfs2_quota_data *qd;
        int error;
-        qd = kzalloc(sizeof(struct gfs2_quota_data), GFP_NOFS);
+        qd = kmem_cache_zalloc(gfs2_quotad_cachep, GFP_NOFS);
        if (!qd)
                return -ENOMEM;
@@ -119,7 +121,7 @@ static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id,
        return 0;
 fail:
-        kfree(qd);
+        kmem_cache_free(gfs2_quotad_cachep, qd);
        return error;
 }
@@ -158,7 +160,7 @@ static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create,
                if (qd || !create) {
                        if (new_qd) {
                                gfs2_lvb_unhold(new_qd->qd_gl);
-                                kfree(new_qd);
+                                kmem_cache_free(gfs2_quotad_cachep, new_qd);
                        }
                        *qdp = qd;
                        return 0;
@@ -1013,7 +1015,7 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
        if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), change))
                return;
-        if (ip->i_di.di_flags & GFS2_DIF_SYSTEM)
+        if (ip->i_diskflags & GFS2_DIF_SYSTEM)
                return;
        for (x = 0; x < al->al_qd_num; x++) {
@@ -1100,15 +1102,15 @@ static void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void *
 int gfs2_quota_init(struct gfs2_sbd *sdp)
 {
        struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
-        unsigned int blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift;
+        unsigned int blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift;
        unsigned int x, slot = 0;
        unsigned int found = 0;
        u64 dblock;
        u32 extlen = 0;
        int error;
-        if (!ip->i_di.di_size || ip->i_di.di_size > (64 << 20) ||
+        if (!ip->i_disksize || ip->i_disksize > (64 << 20) ||
-            ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1)) {
+            ip->i_disksize & (sdp->sd_sb.sb_bsize - 1)) {
                gfs2_consist_inode(ip);
                return -EIO;
        }
@@ -1195,7 +1197,7 @@ fail:
        return error;
 }
-void gfs2_quota_scan(struct gfs2_sbd *sdp)
+static void gfs2_quota_scan(struct gfs2_sbd *sdp)
 {
        struct gfs2_quota_data *qd, *safe;
        LIST_HEAD(dead);
@@ -1222,7 +1224,7 @@ void gfs2_quota_scan(struct gfs2_sbd *sdp)
                gfs2_assert_warn(sdp, !qd->qd_bh_count);
                gfs2_lvb_unhold(qd->qd_gl);
-                kfree(qd);
+                kmem_cache_free(gfs2_quotad_cachep, qd);
        }
 }
@@ -1257,7 +1259,7 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
                gfs2_assert_warn(sdp, !qd->qd_bh_count);
                gfs2_lvb_unhold(qd->qd_gl);
-                kfree(qd);
+                kmem_cache_free(gfs2_quotad_cachep, qd);
                spin_lock(&sdp->sd_quota_spin);
        }
@@ -1272,3 +1274,94 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
        }
 }
+/*
+ * quotad_error - report a failure from one of the quota daemon's jobs
+ * @sdp: the filesystem
+ * @msg: short name of the job that failed; printed in the log line
+ * @error: value returned by that job
+ *
+ * Nothing is logged when @error is 0 or -EROFS, nor when SDF_SHUTDOWN is
+ * already set in sdp->sd_flags.
+ */
+static void quotad_error(struct gfs2_sbd *sdp, const char *msg, int error)
+{
+        if (error == 0 || error == -EROFS)
+                return;
+        if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+                fs_err(sdp, "gfs2_quotad: %s error %d\n", msg, error);
+}
+/*
+ * quotad_check_timeo - run a periodic job once its countdown has expired
+ * @sdp: the filesystem
+ * @msg: job name handed to quotad_error() with the job's return value
+ * @fxn: the job to run
+ * @t: amount to count down by, in the same units as *@timeo
+ * @timeo: remaining countdown for this job; updated in place
+ * @new_timeo: tunable read through gfs2_tune_get_i() and multiplied by HZ
+ *             to reload *@timeo after the job has run
+ */
+static void quotad_check_timeo(struct gfs2_sbd *sdp, const char *msg,
+                               int (*fxn)(struct gfs2_sbd *sdp),
+                               unsigned long t, unsigned long *timeo,
+                               unsigned int *new_timeo)
+{
+        /* Countdown used up: run the job and start a fresh countdown */
+        if (t >= *timeo) {
+                int error = fxn(sdp);
+                quotad_error(sdp, msg, error);
+                *timeo = gfs2_tune_get_i(&sdp->sd_tune, new_timeo) * HZ;
+        } else {
+                /* Not due yet: just charge @t against the countdown */
+                *timeo -= t;
+        }
+}
+/*
+ * quotad_check_trunc_list - drain sdp->sd_trunc_list
+ * @sdp: the filesystem
+ *
+ * Removes one inode at a time from the list while holding sd_trunc_lock,
+ * then calls gfs2_glock_finish_truncate() on it after the lock has been
+ * dropped. Returns once the list is found empty.
+ */
+static void quotad_check_trunc_list(struct gfs2_sbd *sdp)
+{
+        struct gfs2_inode *ip;
+        while(1) {
+                ip = NULL;
+                spin_lock(&sdp->sd_trunc_lock);
+                if (!list_empty(&sdp->sd_trunc_list)) {
+                        ip = list_entry(sdp->sd_trunc_list.next,
+                                        struct gfs2_inode, i_trunc_list);
+                        list_del_init(&ip->i_trunc_list);
+                }
+                spin_unlock(&sdp->sd_trunc_lock);
+                /* ip is still NULL only if the list was empty */
+                if (ip == NULL)
+                        return;
+                gfs2_glock_finish_truncate(ip);
+        }
+}
+/**
+ * gfs2_quotad - Write cached quota changes into the quota file
+ * @data: Pointer to GFS2 superblock (struct gfs2_sbd), passed as void *
+ *
+ * Thread function; loops until kthread_should_stop() is true. Each pass
+ * syncs the statfs and quota files when their countdowns expire, scans the
+ * quota data, and drains the truncate list, then sleeps on sd_quota_wait.
+ *
+ * Returns: 0
+ */
+int gfs2_quotad(void *data)
+{
+        struct gfs2_sbd *sdp = data;
+        struct gfs2_tune *tune = &sdp->sd_tune;
+        /* Both countdowns and t start at 0, so both jobs run on the first pass */
+        unsigned long statfs_timeo = 0;
+        unsigned long quotad_timeo = 0;
+        unsigned long t = 0;
+        DEFINE_WAIT(wait);
+        int empty;
+        while (!kthread_should_stop()) {
+                /* Update the master statfs file */
+                quotad_check_timeo(sdp, "statfs", gfs2_statfs_sync, t,
+                                   &statfs_timeo, &tune->gt_statfs_quantum);
+                /* Update quota file */
+                quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t,
+                                   &quotad_timeo, &tune->gt_quota_quantum);
+                /* FIXME: This should be turned into a shrinker */
+                gfs2_quota_scan(sdp);
+                /* Check for & recover partially truncated inodes */
+                quotad_check_trunc_list(sdp);
+                if (freezing(current))
+                        refrigerator();
+                /* Sleep no longer than the job that is due soonest */
+                t = min(quotad_timeo, statfs_timeo);
+                /*
+                 * NOTE(review): the task is queued on sd_quota_wait before the
+                 * list is tested, presumably so that a wakeup arriving between
+                 * the test and schedule_timeout() is not lost - confirm
+                 * against the code that wakes sd_quota_wait.
+                 */
+                prepare_to_wait(&sdp->sd_quota_wait, &wait, TASK_UNINTERRUPTIBLE);
+                spin_lock(&sdp->sd_trunc_lock);
+                empty = list_empty(&sdp->sd_trunc_list);
+                spin_unlock(&sdp->sd_trunc_lock);
+                /*
+                 * schedule_timeout() returns the unused part of the timeout,
+                 * so after the subtraction t is the time actually slept; the
+                 * next pass charges it against both countdowns.
+                 */
+                if (empty)
+                        t -= schedule_timeout(t);
+                else
+                        /* Truncate work is pending: loop again without sleeping */
+                        t = 0;
+                finish_wait(&sdp->sd_quota_wait, &wait);
+        }
+        return 0;
+}
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 3b7f4b0e5dfe..cec9032be97d 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -15,22 +15,22 @@ struct gfs2_sbd;
 #define NO_QUOTA_CHANGE ((u32)-1)
-int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid);
+extern int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid);
-void gfs2_quota_unhold(struct gfs2_inode *ip);
+extern void gfs2_quota_unhold(struct gfs2_inode *ip);
-int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid);
+extern int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid);
-void gfs2_quota_unlock(struct gfs2_inode *ip);
+extern void gfs2_quota_unlock(struct gfs2_inode *ip);
-int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid);
+extern int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid);
-void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
+extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
-                       u32 uid, u32 gid);
+                              u32 uid, u32 gid);
-int gfs2_quota_sync(struct gfs2_sbd *sdp);
+extern int gfs2_quota_sync(struct gfs2_sbd *sdp);
-int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id);
+extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id);
-int gfs2_quota_init(struct gfs2_sbd *sdp);
+extern int gfs2_quota_init(struct gfs2_sbd *sdp);
-void gfs2_quota_scan(struct gfs2_sbd *sdp);
+extern void gfs2_quota_cleanup(struct gfs2_sbd *sdp);
-void gfs2_quota_cleanup(struct gfs2_sbd *sdp);
+extern int gfs2_quotad(void *data);
 static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
 {
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index d5e91f4f6a0b..efd09c3d2b26 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -14,6 +14,8 @@
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
 #include <linux/lm_interface.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
 #include "gfs2.h"
 #include "incore.h"
@@ -583,13 +585,35 @@ fail:
        return error;
 }
+/*
+ * gfs2_jdesc_find_dirty - claim the first journal marked dirty
+ * @sdp: the filesystem
+ *
+ * Walks sdp->sd_jindex_list under sd_jindex_spin and clears jd_dirty on the
+ * first descriptor that has it set.
+ *
+ * Returns: that journal descriptor, or NULL if none was dirty
+ */
+static struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp)
+{
+        struct gfs2_jdesc *jd;
+        int found = 0;
+        spin_lock(&sdp->sd_jindex_spin);
+        list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
+                if (jd->jd_dirty) {
+                        jd->jd_dirty = 0;
+                        found = 1;
+                        break;
+                }
+        }
+        spin_unlock(&sdp->sd_jindex_spin);
+        /*
+         * list_for_each_entry() does not leave jd NULL when the walk ends
+         * without a match, so the result is decided by 'found', not by jd.
+         */
+        if (!found)
+                jd = NULL;
+        return jd;
+}
 /**
 * gfs2_check_journals - Recover any dirty journals
 * @sdp: the filesystem
 *
 */
-void gfs2_check_journals(struct gfs2_sbd *sdp)
+static void gfs2_check_journals(struct gfs2_sbd *sdp)
 {
        struct gfs2_jdesc *jd;
@@ -603,3 +627,25 @@ void gfs2_check_journals(struct gfs2_sbd *sdp)
        }
 }
+/**
+ * gfs2_recoverd - Recover dead machine's journals
+ * @data: Pointer to GFS2 superblock (struct gfs2_sbd), passed as void *
+ *
+ * Thread function; loops until kthread_should_stop() is true, calling
+ * gfs2_check_journals() on each pass and then sleeping.
+ *
+ * Returns: 0
+ */
+int gfs2_recoverd(void *data)
+{
+        struct gfs2_sbd *sdp = data;
+        unsigned long t;
+        while (!kthread_should_stop()) {
+                gfs2_check_journals(sdp);
+                /* Re-read the tunable every pass; the sleep length is gt_recoverd_secs * HZ */
+                t = gfs2_tune_get(sdp,  gt_recoverd_secs) * HZ;
+                if (freezing(current))
+                        refrigerator();
+                schedule_timeout_interruptible(t);
+        }
+        return 0;
+}
diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h
index f7235e61c723..a8218ea15b57 100644
--- a/fs/gfs2/recovery.h
+++ b/fs/gfs2/recovery.h
@@ -18,17 +18,17 @@ static inline void gfs2_replay_incr_blk(struct gfs2_sbd *sdp, unsigned int *blk)
                *blk = 0;
 }
-int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
+extern int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
                           struct buffer_head **bh);
-int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where);
+extern int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where);
-int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where);
+extern int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where);
-void gfs2_revoke_clean(struct gfs2_sbd *sdp);
+extern void gfs2_revoke_clean(struct gfs2_sbd *sdp);
-int gfs2_find_jhead(struct gfs2_jdesc *jd,
+extern int gfs2_find_jhead(struct gfs2_jdesc *jd,
                    struct gfs2_log_header_host *head);
-int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd);
+extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd);
-void gfs2_check_journals(struct gfs2_sbd *sdp);
+extern int gfs2_recoverd(void *data);
 #endif /* __RECOVERY_DOT_H__ */
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 2d90fb253505..8b01c635d925 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -269,16 +269,14 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
                                                  bi->bi_len, x);
        }
-        if (count[0] != rgd->rd_rg.rg_free) {
+        if (count[0] != rgd->rd_free) {
                if (gfs2_consist_rgrpd(rgd))
                        fs_err(sdp, "free data mismatch:  %u != %u\n",
-                               count[0], rgd->rd_rg.rg_free);
+                               count[0], rgd->rd_free);
                return;
        }
-        tmp = rgd->rd_data -
+        tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes;
-                rgd->rd_rg.rg_free -
-                rgd->rd_rg.rg_dinodes;
        if (count[1] + count[2] != tmp) {
                if (gfs2_consist_rgrpd(rgd))
                        fs_err(sdp, "used data mismatch:  %u != %u\n",
@@ -286,10 +284,10 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
                return;
        }
-        if (count[3] != rgd->rd_rg.rg_dinodes) {
+        if (count[3] != rgd->rd_dinodes) {
                if (gfs2_consist_rgrpd(rgd))
                        fs_err(sdp, "used metadata mismatch:  %u != %u\n",
-                               count[3], rgd->rd_rg.rg_dinodes);
+                               count[3], rgd->rd_dinodes);
                return;
        }
@@ -501,7 +499,7 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp)
        for (rgrps = 0;; rgrps++) {
                loff_t pos = rgrps * sizeof(struct gfs2_rindex);
-                if (pos + sizeof(struct gfs2_rindex) >= ip->i_di.di_size)
+                if (pos + sizeof(struct gfs2_rindex) >= ip->i_disksize)
                        break;
                error = gfs2_internal_read(ip, &ra_state, buf, &pos,
                                           sizeof(struct gfs2_rindex));
@@ -590,7 +588,7 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
        struct inode *inode = &ip->i_inode;
        struct file_ra_state ra_state;
-        u64 rgrp_count = ip->i_di.di_size;
+        u64 rgrp_count = ip->i_disksize;
        int error;
        if (do_div(rgrp_count, sizeof(struct gfs2_rindex))) {
@@ -634,7 +632,7 @@ static int gfs2_ri_update_special(struct gfs2_inode *ip)
        for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) {
                /* Ignore partials */
                if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) >
-                    ip->i_di.di_size)
+                    ip->i_disksize)
                        break;
                error = read_rindex_entry(ip, &ra_state);
                if (error) {
@@ -692,7 +690,6 @@ int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh)
 static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
 {
        const struct gfs2_rgrp *str = buf;
-        struct gfs2_rgrp_host *rg = &rgd->rd_rg;
        u32 rg_flags;
        rg_flags = be32_to_cpu(str->rg_flags);
@@ -700,24 +697,23 @@ static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
                rgd->rd_flags |= GFS2_RDF_NOALLOC;
        else
                rgd->rd_flags &= ~GFS2_RDF_NOALLOC;
-        rg->rg_free = be32_to_cpu(str->rg_free);
+        rgd->rd_free = be32_to_cpu(str->rg_free);
-        rg->rg_dinodes = be32_to_cpu(str->rg_dinodes);
+        rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes);
-        rg->rg_igeneration = be64_to_cpu(str->rg_igeneration);
+        rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration);
 }
 static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
 {
        struct gfs2_rgrp *str = buf;
-        struct gfs2_rgrp_host *rg = &rgd->rd_rg;
        u32 rg_flags = 0;
        if (rgd->rd_flags & GFS2_RDF_NOALLOC)
                rg_flags |= GFS2_RGF_NOALLOC;
        str->rg_flags = cpu_to_be32(rg_flags);
-        str->rg_free = cpu_to_be32(rg->rg_free);
+        str->rg_free = cpu_to_be32(rgd->rd_free);
-        str->rg_dinodes = cpu_to_be32(rg->rg_dinodes);
+        str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes);
        str->__pad = cpu_to_be32(0);
-        str->rg_igeneration = cpu_to_be64(rg->rg_igeneration);
+        str->rg_igeneration = cpu_to_be64(rgd->rd_igeneration);
        memset(&str->rg_reserved, 0, sizeof(str->rg_reserved));
 }
@@ -776,7 +772,7 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
        }
        spin_lock(&sdp->sd_rindex_spin);
-        rgd->rd_free_clone = rgd->rd_rg.rg_free;
+        rgd->rd_free_clone = rgd->rd_free;
        rgd->rd_bh_count++;
        spin_unlock(&sdp->sd_rindex_spin);
@@ -850,7 +846,7 @@ void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
        }
        spin_lock(&sdp->sd_rindex_spin);
-        rgd->rd_free_clone = rgd->rd_rg.rg_free;
+        rgd->rd_free_clone = rgd->rd_free;
        spin_unlock(&sdp->sd_rindex_spin);
 }
@@ -1403,8 +1399,8 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n)
        block = rgd->rd_data0 + blk;
        ip->i_goal = block;
-        gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free >= *n);
+        gfs2_assert_withdraw(sdp, rgd->rd_free >= *n);
-        rgd->rd_rg.rg_free -= *n;
+        rgd->rd_free -= *n;
        gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
        gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
@@ -1445,10 +1441,10 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
        block = rgd->rd_data0 + blk;
-        gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
+        gfs2_assert_withdraw(sdp, rgd->rd_free);
-        rgd->rd_rg.rg_free--;
+        rgd->rd_free--;
-        rgd->rd_rg.rg_dinodes++;
+        rgd->rd_dinodes++;
-        *generation = rgd->rd_rg.rg_igeneration++;
+        *generation = rgd->rd_igeneration++;
        gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
        gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
@@ -1481,7 +1477,7 @@ void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
        if (!rgd)
                return;
-        rgd->rd_rg.rg_free += blen;
+        rgd->rd_free += blen;
        gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
        gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
@@ -1509,7 +1505,7 @@ void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
        if (!rgd)
                return;
-        rgd->rd_rg.rg_free += blen;
+        rgd->rd_free += blen;
        gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
        gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
@@ -1546,10 +1542,10 @@ static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno)
                return;
        gfs2_assert_withdraw(sdp, rgd == tmp_rgd);
-        if (!rgd->rd_rg.rg_dinodes)
+        if (!rgd->rd_dinodes)
                gfs2_consist_rgrpd(rgd);
-        rgd->rd_rg.rg_dinodes--;
+        rgd->rd_dinodes--;
-        rgd->rd_rg.rg_free++;
+        rgd->rd_free++;
        gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
        gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index c3ba3d9d0aac..141b781f2fcc 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -34,76 +34,6 @@
 #include "util.h"
 /**
- * gfs2_jindex_hold - Grab a lock on the jindex
- * @sdp: The GFS2 superblock
- * @ji_gh: the holder for the jindex glock
- *
- * This is very similar to the gfs2_rindex_hold() function, except that
- * in general we hold the jindex lock for longer periods of time and
- * we grab it far less frequently (in general) then the rgrp lock.
- *
- * Returns: errno
- */
-int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
-{
-        struct gfs2_inode *dip = GFS2_I(sdp->sd_jindex);
-        struct qstr name;
-        char buf[20];
-        struct gfs2_jdesc *jd;
-        int error;
-        name.name = buf;
-        mutex_lock(&sdp->sd_jindex_mutex);
-        for (;;) {
-                error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, ji_gh);
-                if (error)
-                        break;
-                name.len = sprintf(buf, "journal%u", sdp->sd_journals);
-                name.hash = gfs2_disk_hash(name.name, name.len);
-                error = gfs2_dir_check(sdp->sd_jindex, &name, NULL);
-                if (error == -ENOENT) {
-                        error = 0;
-                        break;
-                }
-                gfs2_glock_dq_uninit(ji_gh);
-                if (error)
-                        break;
-                error = -ENOMEM;
-                jd = kzalloc(sizeof(struct gfs2_jdesc), GFP_KERNEL);
-                if (!jd)
-                        break;
-                INIT_LIST_HEAD(&jd->extent_list);
-                jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1);
-                if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
-                        if (!jd->jd_inode)
-                                error = -ENOENT;
-                        else
-                                error = PTR_ERR(jd->jd_inode);
-                        kfree(jd);
-                        break;
-                }
-                spin_lock(&sdp->sd_jindex_spin);
-                jd->jd_jid = sdp->sd_journals++;
-                list_add_tail(&jd->jd_list, &sdp->sd_jindex_list);
-                spin_unlock(&sdp->sd_jindex_spin);
-        }
-        mutex_unlock(&sdp->sd_jindex_mutex);
-        return error;
-}
-/**
 * gfs2_jindex_free - Clear all the journal index information
 * @sdp: The GFS2 superblock
 *
@@ -166,39 +96,6 @@ struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid)
        return jd;
 }
-void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid)
-{
-        struct gfs2_jdesc *jd;
-        spin_lock(&sdp->sd_jindex_spin);
-        jd = jdesc_find_i(&sdp->sd_jindex_list, jid);
-        if (jd)
-                jd->jd_dirty = 1;
-        spin_unlock(&sdp->sd_jindex_spin);
-}
-struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp)
-{
-        struct gfs2_jdesc *jd;
-        int found = 0;
-        spin_lock(&sdp->sd_jindex_spin);
-        list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
-                if (jd->jd_dirty) {
-                        jd->jd_dirty = 0;
-                        found = 1;
-                        break;
-                }
-        }
-        spin_unlock(&sdp->sd_jindex_spin);
-        if (!found)
-                jd = NULL;
-        return jd;
-}
 int gfs2_jdesc_check(struct gfs2_jdesc *jd)
 {
        struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
@@ -206,14 +103,14 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd)
        int ar;
        int error;
-        if (ip->i_di.di_size < (8 << 20) || ip->i_di.di_size > (1 << 30) ||
+        if (ip->i_disksize < (8 << 20) || ip->i_disksize > (1 << 30) ||
-            (ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1))) {
+            (ip->i_disksize & (sdp->sd_sb.sb_bsize - 1))) {
                gfs2_consist_inode(ip);
                return -EIO;
        }
-        jd->jd_blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift;
+        jd->jd_blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift;
-        error = gfs2_write_alloc_required(ip, 0, ip->i_di.di_size, &ar);
+        error = gfs2_write_alloc_required(ip, 0, ip->i_disksize, &ar);
        if (!error && ar) {
                gfs2_consist_inode(ip);
                error = -EIO;
@@ -423,137 +320,6 @@ out:
        return error;
 }
-/**
- * gfs2_statfs_i - Do a statfs
- * @sdp: the filesystem
- * @sg: the sg structure
- *
- * Returns: errno
- */
-int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
-{
-        struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
-        struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
-        spin_lock(&sdp->sd_statfs_spin);
-        *sc = *m_sc;
-        sc->sc_total += l_sc->sc_total;
-        sc->sc_free += l_sc->sc_free;
-        sc->sc_dinodes += l_sc->sc_dinodes;
-        spin_unlock(&sdp->sd_statfs_spin);
-        if (sc->sc_free < 0)
-                sc->sc_free = 0;
-        if (sc->sc_free > sc->sc_total)
-                sc->sc_free = sc->sc_total;
-        if (sc->sc_dinodes < 0)
-                sc->sc_dinodes = 0;
-        return 0;
-}
-/**
- * statfs_fill - fill in the sg for a given RG
- * @rgd: the RG
- * @sc: the sc structure
- *
- * Returns: 0 on success, -ESTALE if the LVB is invalid
- */
-static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
-                            struct gfs2_statfs_change_host *sc)
-{
-        gfs2_rgrp_verify(rgd);
-        sc->sc_total += rgd->rd_data;
-        sc->sc_free += rgd->rd_rg.rg_free;
-        sc->sc_dinodes += rgd->rd_rg.rg_dinodes;
-        return 0;
-}
-/**
- * gfs2_statfs_slow - Stat a filesystem using asynchronous locking
- * @sdp: the filesystem
- * @sc: the sc info that will be returned
- *
- * Any error (other than a signal) will cause this routine to fall back
- * to the synchronous version.
- *
- * FIXME: This really shouldn't busy wait like this.
- *
- * Returns: errno
- */
-int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
-{
-        struct gfs2_holder ri_gh;
-        struct gfs2_rgrpd *rgd_next;
-        struct gfs2_holder *gha, *gh;
-        unsigned int slots = 64;
-        unsigned int x;
-        int done;
-        int error = 0, err;
-        memset(sc, 0, sizeof(struct gfs2_statfs_change_host));
-        gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL);
-        if (!gha)
-                return -ENOMEM;
-        error = gfs2_rindex_hold(sdp, &ri_gh);
-        if (error)
-                goto out;
-        rgd_next = gfs2_rgrpd_get_first(sdp);
-        for (;;) {
-                done = 1;
-                for (x = 0; x < slots; x++) {
-                        gh = gha + x;
-                        if (gh->gh_gl && gfs2_glock_poll(gh)) {
-                                err = gfs2_glock_wait(gh);
-                                if (err) {
-                                        gfs2_holder_uninit(gh);
-                                        error = err;
-                                } else {
-                                        if (!error)
-                                                error = statfs_slow_fill(
-                                                        gh->gh_gl->gl_object, sc);
-                                        gfs2_glock_dq_uninit(gh);
-                                }
-                        }
-                        if (gh->gh_gl)
-                                done = 0;
-                        else if (rgd_next && !error) {
-                                error = gfs2_glock_nq_init(rgd_next->rd_gl,
-                                                           LM_ST_SHARED,
-                                                           GL_ASYNC,
-                                                           gh);
-                                rgd_next = gfs2_rgrpd_get_next(rgd_next);
-                                done = 0;
-                        }
-                        if (signal_pending(current))
-                                error = -ERESTARTSYS;
-                }
-                if (done)
-                        break;
-                yield();
-        }
-        gfs2_glock_dq_uninit(&ri_gh);
-out:
-        kfree(gha);
-        return error;
-}
 struct lfcc {
        struct list_head list;
        struct gfs2_holder gh;
@@ -580,10 +346,6 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp,
        struct gfs2_log_header_host lh;
        int error;
-        error = gfs2_jindex_hold(sdp, &ji_gh);
-        if (error)
-                return error;
        list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
                lfcc = kmalloc(sizeof(struct lfcc), GFP_KERNEL);
                if (!lfcc) {
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 50a4c9b1215e..f6b8b00ad881 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -10,6 +10,8 @@
 #ifndef __SUPER_DOT_H__
 #define __SUPER_DOT_H__
+#include <linux/fs.h>
+#include <linux/dcache.h>
 #include "incore.h"
 void gfs2_lm_unmount(struct gfs2_sbd *sdp);
@@ -23,12 +25,9 @@ static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
        return x;
 }
-int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh);
 void gfs2_jindex_free(struct gfs2_sbd *sdp);
 struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid);
-void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid);
-struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp);
 int gfs2_jdesc_check(struct gfs2_jdesc *jd);
 int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
@@ -40,11 +39,15 @@ int gfs2_statfs_init(struct gfs2_sbd *sdp);
 void gfs2_statfs_change(struct gfs2_sbd *sdp,
                        s64 total, s64 free, s64 dinodes);
 int gfs2_statfs_sync(struct gfs2_sbd *sdp);
-int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc);
-int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc);
 int gfs2_freeze_fs(struct gfs2_sbd *sdp);
 void gfs2_unfreeze_fs(struct gfs2_sbd *sdp);
+extern struct file_system_type gfs2_fs_type;
+extern struct file_system_type gfs2meta_fs_type;
+extern const struct export_operations gfs2_export_ops;
+extern const struct super_operations gfs2_super_ops;
+extern struct dentry_operations gfs2_dops;
 #endif /* __SUPER_DOT_H__ */
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 7e1879f1a02c..26c1fa777a95 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -26,9 +26,6 @@
 #include "quota.h"
 #include "util.h"
-char *gfs2_sys_margs;
-spinlock_t gfs2_sys_margs_lock;
 static ssize_t id_show(struct gfs2_sbd *sdp, char *buf)
 {
        return snprintf(buf, PAGE_SIZE, "%u:%u\n",
@@ -263,7 +260,6 @@ ARGS_ATTR(localcaching,    "%d\n");
 ARGS_ATTR(localflocks,     "%d\n");
 ARGS_ATTR(debug,           "%d\n");
 ARGS_ATTR(upgrade,         "%d\n");
-ARGS_ATTR(num_glockd,      "%u\n");
 ARGS_ATTR(posix_acl,       "%d\n");
 ARGS_ATTR(quota,           "%u\n");
 ARGS_ATTR(suiddir,         "%d\n");
@@ -279,7 +275,6 @@ static struct attribute *args_attrs[] = {
        &args_attr_localflocks.attr,
        &args_attr_debug.attr,
        &args_attr_upgrade.attr,
-        &args_attr_num_glockd.attr,
        &args_attr_posix_acl.attr,
        &args_attr_quota.attr,
        &args_attr_suiddir.attr,
@@ -288,30 +283,6 @@ static struct attribute *args_attrs[] = {
 };
 /*
- * display counters from superblock
- */
-struct counters_attr {
-        struct attribute attr;
-        ssize_t (*show)(struct gfs2_sbd *, char *);
-};
-#define COUNTERS_ATTR(name, fmt)                                            \
-static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf)                 \
-{                                                                           \
-        return snprintf(buf, PAGE_SIZE, fmt,                                \
-                        (unsigned int)atomic_read(&sdp->sd_##name));        \
-}                                                                           \
-static struct counters_attr counters_attr_##name = __ATTR_RO(name)
-COUNTERS_ATTR(reclaimed,        "%u\n");
-static struct attribute *counters_attrs[] = {
-        &counters_attr_reclaimed.attr,
-        NULL,
-};
-/*
 * get and set struct gfs2_tune fields
 */
@@ -393,7 +364,6 @@ static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
 }                                                                             \
 TUNE_ATTR_2(name, name##_store)
-TUNE_ATTR(demote_secs, 0);
 TUNE_ATTR(incore_log_blocks, 0);
 TUNE_ATTR(log_flush_secs, 0);
 TUNE_ATTR(quota_warn_period, 0);
@@ -408,11 +378,9 @@ TUNE_ATTR(stall_secs, 1);
 TUNE_ATTR(statfs_quantum, 1);
 TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process);
 TUNE_ATTR_DAEMON(logd_secs, logd_process);
-TUNE_ATTR_DAEMON(quotad_secs, quotad_process);
 TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store);
 static struct attribute *tune_attrs[] = {
-        &tune_attr_demote_secs.attr,
        &tune_attr_incore_log_blocks.attr,
        &tune_attr_log_flush_secs.attr,
        &tune_attr_quota_warn_period.attr,
@@ -426,7 +394,6 @@ static struct attribute *tune_attrs[] = {
        &tune_attr_statfs_quantum.attr,
        &tune_attr_recoverd_secs.attr,
        &tune_attr_logd_secs.attr,
-        &tune_attr_quotad_secs.attr,
        &tune_attr_quota_scale.attr,
        &tune_attr_new_files_jdata.attr,
        NULL,
@@ -437,11 +404,6 @@ static struct attribute_group lockstruct_group = {
        .attrs = lockstruct_attrs,
 };
-static struct attribute_group counters_group = {
-        .name = "counters",
-        .attrs = counters_attrs,
-};
 static struct attribute_group args_group = {
        .name = "args",
        .attrs = args_attrs,
@@ -466,13 +428,9 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
        if (error)
                goto fail_reg;
-        error = sysfs_create_group(&sdp->sd_kobj, &counters_group);
-        if (error)
-                goto fail_lockstruct;
        error = sysfs_create_group(&sdp->sd_kobj, &args_group);
        if (error)
-                goto fail_counters;
+                goto fail_lockstruct;
        error = sysfs_create_group(&sdp->sd_kobj, &tune_group);
        if (error)
@@ -483,8 +441,6 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
 fail_args:
        sysfs_remove_group(&sdp->sd_kobj, &args_group);
-fail_counters:
-        sysfs_remove_group(&sdp->sd_kobj, &counters_group);
 fail_lockstruct:
        sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
 fail_reg:
@@ -498,16 +454,27 @@ void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
 {
        sysfs_remove_group(&sdp->sd_kobj, &tune_group);
        sysfs_remove_group(&sdp->sd_kobj, &args_group);
-        sysfs_remove_group(&sdp->sd_kobj, &counters_group);
        sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
        kobject_put(&sdp->sd_kobj);
 }
+static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
+                       struct kobj_uevent_env *env)
+{
+        struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
+        add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name);
+        add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name);
+        return 0;
+}
+static struct kset_uevent_ops gfs2_uevent_ops = {
+        .uevent = gfs2_uevent,
+};
 int gfs2_sys_init(void)
 {
-        gfs2_sys_margs = NULL;
+        gfs2_kset = kset_create_and_add("gfs2", &gfs2_uevent_ops, fs_kobj);
-        spin_lock_init(&gfs2_sys_margs_lock);
-        gfs2_kset = kset_create_and_add("gfs2", NULL, fs_kobj);
        if (!gfs2_kset)
                return -ENOMEM;
        return 0;
@@ -515,7 +482,6 @@ int gfs2_sys_init(void)
 void gfs2_sys_uninit(void)
 {
-        kfree(gfs2_sys_margs);
        kset_unregister(gfs2_kset);
 }
diff --git a/fs/gfs2/sys.h b/fs/gfs2/sys.h
index 1ca8cdac5304..e94560e836d7 100644
--- a/fs/gfs2/sys.h
+++ b/fs/gfs2/sys.h
@@ -13,10 +13,6 @@
 #include <linux/spinlock.h>
 struct gfs2_sbd;
-/* Allow args to be passed to GFS2 when using an initial ram disk */
-extern char *gfs2_sys_margs;
-extern spinlock_t gfs2_sys_margs_lock;
 int gfs2_sys_fs_add(struct gfs2_sbd *sdp);
 void gfs2_sys_fs_del(struct gfs2_sbd *sdp);
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index d31e355c61fb..374f50e95496 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -25,6 +25,7 @@ struct kmem_cache *gfs2_glock_cachep __read_mostly;
 struct kmem_cache *gfs2_inode_cachep __read_mostly;
 struct kmem_cache *gfs2_bufdata_cachep __read_mostly;
 struct kmem_cache *gfs2_rgrpd_cachep __read_mostly;
+struct kmem_cache *gfs2_quotad_cachep __read_mostly;
 void gfs2_assert_i(struct gfs2_sbd *sdp)
 {
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
index 7f48576289c9..33e96b0ce9ab 100644
--- a/fs/gfs2/util.h
+++ b/fs/gfs2/util.h
@@ -148,6 +148,7 @@ extern struct kmem_cache *gfs2_glock_cachep;
 extern struct kmem_cache *gfs2_inode_cachep;
 extern struct kmem_cache *gfs2_bufdata_cachep;
 extern struct kmem_cache *gfs2_rgrpd_cachep;
+extern struct kmem_cache *gfs2_quotad_cachep;
 static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt,
                                           unsigned int *p)
diff --git a/fs/hfs/Kconfig b/fs/hfs/Kconfig
new file mode 100644
index 000000000000..b77c5bc20f8a
--- /dev/null
+++ b/fs/hfs/Kconfig
@@ -0,0 +1,12 @@
+config HFS_FS
+        tristate "Apple Macintosh file system support (EXPERIMENTAL)"
+        depends on BLOCK && EXPERIMENTAL
+        select NLS
+        help
+          If you say Y here, you will be able to mount Macintosh-formatted
+          floppy disks and hard drive partitions with full read-write access.
+          Please read <file:Documentation/filesystems/hfs.txt> to learn about
+          the available mount options.
+          To compile this file system support as a module, choose M here: the
+          module will be called hfs.
diff --git a/fs/hfsplus/Kconfig b/fs/hfsplus/Kconfig
new file mode 100644
index 000000000000..a63371815aab
--- /dev/null
+++ b/fs/hfsplus/Kconfig
@@ -0,0 +1,13 @@
+config HFSPLUS_FS
+        tristate "Apple Extended HFS file system support"
+        depends on BLOCK
+        select NLS
+        select NLS_UTF8
+        help
+          If you say Y here, you will be able to mount extended format
+          Macintosh-formatted hard drive partitions with full read-write access.
+          This file system is often called HFS+ and was introduced with
+          MacOS 8. It includes all Mac specific filesystem data such as
+          data forks and creator codes, but it also has several UNIX
+          style features such as file ownership and permissions.
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 3a31451ac170..5c538e0ec14b 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -501,7 +501,7 @@ int hostfs_write_begin(struct file *file, struct address_space *mapping,
 {
        pgoff_t index = pos >> PAGE_CACHE_SHIFT;
-        *pagep = __grab_cache_page(mapping, index);
+        *pagep = grab_cache_page_write_begin(mapping, index, flags);
        if (!*pagep)
                return -ENOMEM;
        return 0;
diff --git a/fs/hpfs/Kconfig b/fs/hpfs/Kconfig
new file mode 100644
index 000000000000..56bd15c5bf6c
--- /dev/null
+++ b/fs/hpfs/Kconfig
@@ -0,0 +1,14 @@
+config HPFS_FS
+        tristate "OS/2 HPFS file system support"
+        depends on BLOCK
+        help
+          OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS
+          is the file system used for organizing files on OS/2 hard disk
+          partitions. Say Y if you want to be able to read files from and
+          write files to an OS/2 HPFS partition on your hard drive. OS/2
+          floppies however are in regular MSDOS format, so you don't need this
+          option in order to be able to read them. Read
+          <file:Documentation/filesystems/hpfs.txt>.
+          To compile this file system support as a module, choose M here: the
+          module will be called hpfs.  If unsure, say N.
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 7d479ce3aceb..6903d37af037 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -252,6 +252,7 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
        for (;;) {
                struct page *page;
                unsigned long nr, ret;
+                int ra;
                /* nr is the maximum number of bytes to copy from this page */
                nr = huge_page_size(h);
@@ -274,16 +275,19 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
                         */
                        ret = len < nr ? len : nr;
                        if (clear_user(buf, ret))
-                                ret = -EFAULT;
+                                ra = -EFAULT;
+                        else
+                                ra = 0;
                } else {
                        /*
                         * We have the page, copy it to user space buffer.
                         */
-                        ret = hugetlbfs_read_actor(page, offset, buf, len, nr);
+                        ra = hugetlbfs_read_actor(page, offset, buf, len, nr);
+                        ret = ra;
                }
-                if (ret < 0) {
+                if (ra < 0) {
                        if (retval == 0)
-                                retval = ret;
+                                retval = ra;
                        if (page)
                                page_cache_release(page);
                        goto out;
@@ -506,7 +510,6 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
                inode->i_mode = mode;
                inode->i_uid = uid;
                inode->i_gid = gid;
-                inode->i_blocks = 0;
                inode->i_mapping->a_ops = &hugetlbfs_aops;
                inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
                inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
diff --git a/fs/inode.c b/fs/inode.c
index ed22b14f2202..40e37c026565 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -23,6 +23,7 @@
 #include <linux/bootmem.h>
 #include <linux/inotify.h>
 #include <linux/mount.h>
+#include <linux/async.h>
 /*
 * This is needed for the following functions:
@@ -111,8 +112,8 @@ static void wake_up_inode(struct inode *inode)
 /**
 * inode_init_always - perform inode structure intialisation
- * @sb          - superblock inode belongs to.
+ * @sb: superblock inode belongs to
- * @inode       - inode to initialise
+ * @inode: inode to initialise
 *
 * These are initializations that need to be done on every inode
 * allocation as the fields are not initialised by slab allocation.
@@ -132,6 +133,8 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
        inode->i_op = &empty_iops;
        inode->i_fop = &empty_fops;
        inode->i_nlink = 1;
+        inode->i_uid = 0;
+        inode->i_gid = 0;
        atomic_set(&inode->i_writecount, 0);
        inode->i_size = 0;
        inode->i_blocks = 0;
@@ -165,7 +168,7 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
        mapping->a_ops = &empty_aops;
        mapping->host = inode;
        mapping->flags = 0;
-        mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE);
+        mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
        mapping->assoc_mapping = NULL;
        mapping->backing_dev_info = &default_backing_dev_info;
        mapping->writeback_index = 0;
@@ -584,8 +587,8 @@ __inode_add_to_lists(struct super_block *sb, struct hlist_head *head,
 /**
 * inode_add_to_lists - add a new inode to relevant lists
- * @sb          - superblock inode belongs to.
+ * @sb: superblock inode belongs to
- * @inode       - inode to mark in use
+ * @inode: inode to mark in use
 *
 * When an inode is allocated it needs to be accounted for, added to the in use
 * list, the owning superblock and the inode hash. This needs to be done under
@@ -609,7 +612,7 @@ EXPORT_SYMBOL_GPL(inode_add_to_lists);
 *      @sb: superblock
 *
 *      Allocates a new inode for given superblock. The default gfp_mask
- *      for allocations related to inode->i_mapping is GFP_HIGHUSER_PAGECACHE.
+ *      for allocations related to inode->i_mapping is GFP_HIGHUSER_MOVABLE.
 *      If HIGHMEM pages are unsuitable or it is known that pages allocated
 *      for the page cache are not reclaimable or migratable,
 *      mapping_set_gfp_mask() must be called with suitable flags on the
@@ -1042,6 +1045,65 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino)
 EXPORT_SYMBOL(iget_locked);
+int insert_inode_locked(struct inode *inode)
+{
+        struct super_block *sb = inode->i_sb;
+        ino_t ino = inode->i_ino;
+        struct hlist_head *head = inode_hashtable + hash(sb, ino);
+        struct inode *old;
+        inode->i_state |= I_LOCK|I_NEW;
+        while (1) {
+                spin_lock(&inode_lock);
+                old = find_inode_fast(sb, head, ino);
+                if (likely(!old)) {
+                        hlist_add_head(&inode->i_hash, head);
+                        spin_unlock(&inode_lock);
+                        return 0;
+                }
+                __iget(old);
+                spin_unlock(&inode_lock);
+                wait_on_inode(old);
+                if (unlikely(!hlist_unhashed(&old->i_hash))) {
+                        iput(old);
+                        return -EBUSY;
+                }
+                iput(old);
+        }
+}
+EXPORT_SYMBOL(insert_inode_locked);
+int insert_inode_locked4(struct inode *inode, unsigned long hashval,
+                int (*test)(struct inode *, void *), void *data)
+{
+        struct super_block *sb = inode->i_sb;
+        struct hlist_head *head = inode_hashtable + hash(sb, hashval);
+        struct inode *old;
+        inode->i_state |= I_LOCK|I_NEW;
+        while (1) {
+                spin_lock(&inode_lock);
+                old = find_inode(sb, head, test, data);
+                if (likely(!old)) {
+                        hlist_add_head(&inode->i_hash, head);
+                        spin_unlock(&inode_lock);
+                        return 0;
+                }
+                __iget(old);
+                spin_unlock(&inode_lock);
+                wait_on_inode(old);
+                if (unlikely(!hlist_unhashed(&old->i_hash))) {
+                        iput(old);
+                        return -EBUSY;
+                }
+                iput(old);
+        }
+}
+EXPORT_SYMBOL(insert_inode_locked4);
 /**
 *      __insert_inode_hash - hash an inode
 *      @inode: unhashed inode
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 43e8b2c0664b..240ec63984cb 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -231,7 +231,8 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
 #define blk_to_logical(inode, blk) (blk << (inode)->i_blkbits)
 #define logical_to_blk(inode, offset) (offset >> (inode)->i_blkbits);
-/*
+/**
+ * __generic_block_fiemap - FIEMAP for block based inodes (no locking)
 * @inode - the inode to map
 * @arg - the pointer to userspace where we copy everything to
 * @get_block - the fs's get_block function
@@ -242,11 +243,15 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
 *
 * If it is possible to have data blocks beyond a hole past @inode->i_size, then
 * please do not use this function, it will stop at the first unmapped block
- * beyond i_size
+ * beyond i_size.
+ *
+ * If you use this function directly, you need to do your own locking. Use
+ * generic_block_fiemap if you want the locking done for you.
 */
-int generic_block_fiemap(struct inode *inode,
-                         struct fiemap_extent_info *fieinfo, u64 start,
+int __generic_block_fiemap(struct inode *inode,
-                         u64 len, get_block_t *get_block)
+                           struct fiemap_extent_info *fieinfo, u64 start,
+                           u64 len, get_block_t *get_block)
 {
        struct buffer_head tmp;
        unsigned int start_blk;
@@ -260,9 +265,6 @@ int generic_block_fiemap(struct inode *inode,
        start_blk = logical_to_blk(inode, start);
-        /* guard against change */
-        mutex_lock(&inode->i_mutex);
        length = (long long)min_t(u64, len, i_size_read(inode));
        map_len = length;
@@ -334,14 +336,36 @@ int generic_block_fiemap(struct inode *inode,
                cond_resched();
        } while (1);
-        mutex_unlock(&inode->i_mutex);
        /* if ret is 1 then we just hit the end of the extent array */
        if (ret == 1)
                ret = 0;
        return ret;
 }
+EXPORT_SYMBOL(__generic_block_fiemap);
+/**
+ * generic_block_fiemap - FIEMAP for block based inodes
+ * @inode: The inode to map
+ * @fieinfo: The mapping information
+ * @start: The logical (byte) offset at which to start mapping
+ * @len: The length of the extent to attempt to map
+ * @get_block: The block mapping function for the fs
+ *
+ * Calls __generic_block_fiemap to map the inode, after taking
+ * the inode's mutex lock.
+ */
+int generic_block_fiemap(struct inode *inode,
+                         struct fiemap_extent_info *fieinfo, u64 start,
+                         u64 len, get_block_t *get_block)
+{
+        int ret;
+        mutex_lock(&inode->i_mutex);
+        ret = __generic_block_fiemap(inode, fieinfo, start, len, get_block);
+        mutex_unlock(&inode->i_mutex);
+        return ret;
+}
 EXPORT_SYMBOL(generic_block_fiemap);
 #endif  /*  CONFIG_BLOCK  */
@@ -415,6 +439,43 @@ static int ioctl_fioasync(unsigned int fd, struct file *filp,
        return error;
 }
+static int ioctl_fsfreeze(struct file *filp)
+{
+        struct super_block *sb = filp->f_path.dentry->d_inode->i_sb;
+        if (!capable(CAP_SYS_ADMIN))
+                return -EPERM;
+        /* If filesystem doesn't support freeze feature, return. */
+        if (sb->s_op->freeze_fs == NULL)
+                return -EOPNOTSUPP;
+        /* If a blockdevice-backed filesystem isn't specified, return. */
+        if (sb->s_bdev == NULL)
+                return -EINVAL;
+        /* Freeze */
+        sb = freeze_bdev(sb->s_bdev);
+        if (IS_ERR(sb))
+                return PTR_ERR(sb);
+        return 0;
+}
+static int ioctl_fsthaw(struct file *filp)
+{
+        struct super_block *sb = filp->f_path.dentry->d_inode->i_sb;
+        if (!capable(CAP_SYS_ADMIN))
+                return -EPERM;
+        /* If a blockdevice-backed filesystem isn't specified, return EINVAL. */
+        if (sb->s_bdev == NULL)
+                return -EINVAL;
+        /* Thaw */
+        return thaw_bdev(sb->s_bdev, sb);
+}
 /*
 * When you add any new common ioctls to the switches above and below
 * please update compat_sys_ioctl() too.
@@ -462,6 +523,15 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
                } else
                        error = -ENOTTY;
                break;
+        case FIFREEZE:
+                error = ioctl_fsfreeze(filp);
+                break;
+        case FITHAW:
+                error = ioctl_fsthaw(filp);
+                break;
        default:
                if (S_ISREG(filp->f_path.dentry->d_inode->i_mode))
                        error = file_ioctl(filp, cmd, arg);
@@ -472,7 +542,7 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
        return error;
 }
-asmlinkage long sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
+SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
 {
        struct file *filp;
        int error = -EBADF;
diff --git a/fs/ioprio.c b/fs/ioprio.c
index 3569e0ad86a2..c7c0b28d7d21 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -27,7 +27,7 @@
 #include <linux/security.h>
 #include <linux/pid_namespace.h>
-static int set_task_ioprio(struct task_struct *task, int ioprio)
+int set_task_ioprio(struct task_struct *task, int ioprio)
 {
        int err;
        struct io_context *ioc;
@@ -70,8 +70,9 @@ static int set_task_ioprio(struct task_struct *task, int ioprio)
        task_unlock(task);
        return err;
 }
+EXPORT_SYMBOL_GPL(set_task_ioprio);
-asmlinkage long sys_ioprio_set(int which, int who, int ioprio)
+SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
 {
        int class = IOPRIO_PRIO_CLASS(ioprio);
        int data = IOPRIO_PRIO_DATA(ioprio);
@@ -187,7 +188,7 @@ int ioprio_best(unsigned short aprio, unsigned short bprio)
                return aprio;
 }
-asmlinkage long sys_ioprio_get(int which, int who)
+SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
 {
        struct task_struct *g, *p;
        struct user_struct *user;
@@ -251,4 +252,3 @@ asmlinkage long sys_ioprio_get(int which, int who)
        read_unlock(&tasklist_lock);
        return ret;
 }
diff --git a/fs/isofs/Kconfig b/fs/isofs/Kconfig
new file mode 100644
index 000000000000..8ab9878e3671
--- /dev/null
+++ b/fs/isofs/Kconfig
@@ -0,0 +1,39 @@
+config ISO9660_FS
+        tristate "ISO 9660 CDROM file system support"
+        help
+          This is the standard file system used on CD-ROMs.  It was previously
+          known as "High Sierra File System" and is called "hsfs" on other
+          Unix systems.  The so-called Rock-Ridge extensions which allow for
+          long Unix filenames and symbolic links are also supported by this
+          driver.  If you have a CD-ROM drive and want to do more with it than
+          just listen to audio CDs and watch its LEDs, say Y (and read
+          <file:Documentation/filesystems/isofs.txt> and the CD-ROM-HOWTO,
+          available from <http://www.tldp.org/docs.html#howto>), thereby
+          enlarging your kernel by about 27 KB; otherwise say N.
+          To compile this file system support as a module, choose M here: the
+          module will be called isofs.
+config JOLIET
+        bool "Microsoft Joliet CDROM extensions"
+        depends on ISO9660_FS
+        select NLS
+        help
+          Joliet is a Microsoft extension for the ISO 9660 CD-ROM file system
+          which allows for long filenames in unicode format (unicode is the
+          new 16 bit character code, successor to ASCII, which encodes the
+          characters of almost all languages of the world; see
+          <http://www.unicode.org/> for more information).  Say Y here if you
+          want to be able to read Joliet CD-ROMs under Linux.
+config ZISOFS
+        bool "Transparent decompression extension"
+        depends on ISO9660_FS
+        select ZLIB_INFLATE
+        help
+          This is a Linux-specific extension to RockRidge which lets you store
+          data in compressed form on a CD-ROM and have it transparently
+          decompressed when the CD-ROM is accessed.  See
+          <http://www.kernel.org/pub/linux/utils/fs/zisofs/> for the tools
+          necessary to create such a filesystem.  Say Y here if you want to be
+          able to read such compressed CD-ROMs.
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 3f8af0f1505b..6147ec3643a0 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -855,10 +855,6 @@ root_found:
        }
        sbi->s_joliet_level = joliet_level;
-        /* check the root inode */
-        if (!inode->i_op)
-                goto out_bad_root;
        /* Make sure the root inode is a directory */
        if (!S_ISDIR(inode->i_mode)) {
                printk(KERN_WARNING
@@ -886,8 +882,6 @@ root_found:
        /*
         * Display error messages and free resources.
         */
-out_bad_root:
-        printk(KERN_WARNING "%s: root inode not initialized\n", __func__);
 out_iput:
        iput(inode);
        goto out_no_inode;
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 25719d902c51..3fbffb1ea714 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -306,6 +306,8 @@ void journal_commit_transaction(journal_t *journal)
        int flags;
        int err;
        unsigned long blocknr;
+        ktime_t start_time;
+        u64 commit_time;
        char *tagp = NULL;
        journal_header_t *header;
        journal_block_tag_t *tag = NULL;
@@ -418,6 +420,7 @@ void journal_commit_transaction(journal_t *journal)
        commit_transaction->t_state = T_FLUSH;
        journal->j_committing_transaction = commit_transaction;
        journal->j_running_transaction = NULL;
+        start_time = ktime_get();
        commit_transaction->t_log_start = journal->j_head;
        wake_up(&journal->j_wait_transaction_locked);
        spin_unlock(&journal->j_state_lock);
@@ -913,6 +916,18 @@ restart_loop:
        J_ASSERT(commit_transaction == journal->j_committing_transaction);
        journal->j_commit_sequence = commit_transaction->t_tid;
        journal->j_committing_transaction = NULL;
+        commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
+        /*
+         * weight the commit time higher than the average time so we don't
+         * react too strongly to vast changes in commit time
+         */
+        if (likely(journal->j_average_commit_time))
+                journal->j_average_commit_time = (commit_time*3 +
+                                journal->j_average_commit_time) / 4;
+        else
+                journal->j_average_commit_time = commit_time;
        spin_unlock(&journal->j_state_lock);
        if (commit_transaction->t_checkpoint_list == NULL &&
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 60d4c32c8808..e6a117431277 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -25,6 +25,7 @@
 #include <linux/timer.h>
 #include <linux/mm.h>
 #include <linux/highmem.h>
+#include <linux/hrtimer.h>
 static void __journal_temp_unlink_buffer(struct journal_head *jh);
@@ -49,6 +50,7 @@ get_transaction(journal_t *journal, transaction_t *transaction)
 {
        transaction->t_journal = journal;
        transaction->t_state = T_RUNNING;
+        transaction->t_start_time = ktime_get();
        transaction->t_tid = journal->j_transaction_sequence++;
        transaction->t_expires = jiffies + journal->j_commit_interval;
        spin_lock_init(&transaction->t_handle_lock);
@@ -752,7 +754,6 @@ out:
 * int journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update.
 * @handle: transaction to add buffer modifications to
 * @bh:     bh to be used for metadata writes
- * @credits: variable that will receive credits for the buffer
 *
 * Returns an error code or 0 on success.
 *
@@ -1370,7 +1371,7 @@ int journal_stop(handle_t *handle)
 {
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal = transaction->t_journal;
-        int old_handle_count, err;
+        int err;
        pid_t pid;
        J_ASSERT(journal_current_handle() == handle);
@@ -1399,6 +1400,17 @@ int journal_stop(handle_t *handle)
         * on IO anyway.  Speeds up many-threaded, many-dir operations
         * by 30x or more...
         *
+         * We try and optimize the sleep time against what the underlying disk
+         * can do, instead of having a static sleep time.  This is useful for
+         * the case where our storage is so fast that it is more optimal to go
+         * ahead and force a flush and wait for the transaction to be committed
+         * than it is to wait for an arbitrary amount of time for new writers to
+         * join the transaction.  We achieve this by measuring how long it takes
+         * to commit a transaction, and compare it with how long this
+         * transaction has been running, and if run time < commit time then we
+         * sleep for the delta and commit.  This greatly helps super fast disks
+         * that would see slowdowns as more threads started doing fsyncs.
+         *
         * But don't do this if this process was the most recent one to
         * perform a synchronous write.  We do this to detect the case where a
         * single process is doing a stream of sync writes.  No point in waiting
@@ -1406,11 +1418,26 @@ int journal_stop(handle_t *handle)
         */
        pid = current->pid;
        if (handle->h_sync && journal->j_last_sync_writer != pid) {
+                u64 commit_time, trans_time;
                journal->j_last_sync_writer = pid;
-                do {
-                        old_handle_count = transaction->t_handle_count;
+                spin_lock(&journal->j_state_lock);
-                        schedule_timeout_uninterruptible(1);
+                commit_time = journal->j_average_commit_time;
-                } while (old_handle_count != transaction->t_handle_count);
+                spin_unlock(&journal->j_state_lock);
+                trans_time = ktime_to_ns(ktime_sub(ktime_get(),
+                                                   transaction->t_start_time));
+                commit_time = min_t(u64, commit_time,
+                                    1000*jiffies_to_usecs(1));
+                if (trans_time < commit_time) {
+                        ktime_t expires = ktime_add_ns(ktime_get(),
+                                                       commit_time);
+                        set_current_state(TASK_UNINTERRUPTIBLE);
+                        schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
+                }
        }
        current->journal_info = NULL;
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 9497718fe920..17159cacbd9e 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -249,16 +249,14 @@ restart:
        return ret;
 }
-#define NR_BATCH        64
 static void
-__flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
+__flush_batch(journal_t *journal, int *batch_count)
 {
        int i;
-        ll_rw_block(SWRITE, *batch_count, bhs);
+        ll_rw_block(SWRITE, *batch_count, journal->j_chkpt_bhs);
        for (i = 0; i < *batch_count; i++) {
-                struct buffer_head *bh = bhs[i];
+                struct buffer_head *bh = journal->j_chkpt_bhs[i];
                clear_buffer_jwrite(bh);
                BUFFER_TRACE(bh, "brelse");
                __brelse(bh);
@@ -277,8 +275,7 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
 */
 static int __process_buffer(journal_t *journal, struct journal_head *jh,
-                        struct buffer_head **bhs, int *batch_count,
+                            int *batch_count, transaction_t *transaction)
-                        transaction_t *transaction)
 {
        struct buffer_head *bh = jh2bh(jh);
        int ret = 0;
@@ -325,14 +322,14 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
                get_bh(bh);
                J_ASSERT_BH(bh, !buffer_jwrite(bh));
                set_buffer_jwrite(bh);
-                bhs[*batch_count] = bh;
+                journal->j_chkpt_bhs[*batch_count] = bh;
                __buffer_relink_io(jh);
                jbd_unlock_bh_state(bh);
                transaction->t_chp_stats.cs_written++;
                (*batch_count)++;
-                if (*batch_count == NR_BATCH) {
+                if (*batch_count == JBD2_NR_BATCH) {
                        spin_unlock(&journal->j_list_lock);
-                        __flush_batch(journal, bhs, batch_count);
+                        __flush_batch(journal, batch_count);
                        ret = 1;
                }
        }
@@ -388,7 +385,6 @@ restart:
        if (journal->j_checkpoint_transactions == transaction &&
                        transaction->t_tid == this_tid) {
                int batch_count = 0;
-                struct buffer_head *bhs[NR_BATCH];
                struct journal_head *jh;
                int retry = 0, err;
@@ -402,7 +398,7 @@ restart:
                                retry = 1;
                                break;
                        }
-                        retry = __process_buffer(journal, jh, bhs, &batch_count,
+                        retry = __process_buffer(journal, jh, &batch_count,
                                                 transaction);
                        if (retry < 0 && !result)
                                result = retry;
@@ -419,7 +415,7 @@ restart:
                                spin_unlock(&journal->j_list_lock);
                                retry = 1;
                        }
-                        __flush_batch(journal, bhs, &batch_count);
+                        __flush_batch(journal, &batch_count);
                }
                if (retry) {
@@ -686,6 +682,7 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
           safely remove this transaction from the log */
        __jbd2_journal_drop_transaction(journal, transaction);
+        kfree(transaction);
        /* Just in case anybody was waiting for more transactions to be
           checkpointed... */
@@ -760,5 +757,4 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
        J_ASSERT(journal->j_running_transaction != transaction);
        jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
-        kfree(transaction);
 }
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index ebc667bc54a8..62804e57a44c 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -25,6 +25,7 @@
 #include <linux/crc32.h>
 #include <linux/writeback.h>
 #include <linux/backing-dev.h>
+#include <linux/bio.h>
 /*
 * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -137,7 +138,7 @@ static int journal_submit_commit_record(journal_t *journal,
                set_buffer_ordered(bh);
                barrier_done = 1;
        }
-        ret = submit_bh(WRITE, bh);
+        ret = submit_bh(WRITE_SYNC, bh);
        if (barrier_done)
                clear_buffer_ordered(bh);
@@ -158,7 +159,7 @@ static int journal_submit_commit_record(journal_t *journal,
                lock_buffer(bh);
                set_buffer_uptodate(bh);
                clear_buffer_dirty(bh);
-                ret = submit_bh(WRITE, bh);
+                ret = submit_bh(WRITE_SYNC, bh);
        }
        *cbh = bh;
        return ret;
@@ -168,12 +169,34 @@ static int journal_submit_commit_record(journal_t *journal,
 * This function along with journal_submit_commit_record
 * allows to write the commit record asynchronously.
 */
-static int journal_wait_on_commit_record(struct buffer_head *bh)
+static int journal_wait_on_commit_record(journal_t *journal,
+                                         struct buffer_head *bh)
 {
        int ret = 0;
+retry:
        clear_buffer_dirty(bh);
        wait_on_buffer(bh);
+        if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) {
+                printk(KERN_WARNING
+                       "JBD2: wait_on_commit_record: sync failed on %s - "
+                       "disabling barriers\n", journal->j_devname);
+                spin_lock(&journal->j_state_lock);
+                journal->j_flags &= ~JBD2_BARRIER;
+                spin_unlock(&journal->j_state_lock);
+                lock_buffer(bh);
+                clear_buffer_dirty(bh);
+                set_buffer_uptodate(bh);
+                bh->b_end_io = journal_end_buffer_io_sync;
+                ret = submit_bh(WRITE_SYNC, bh);
+                if (ret) {
+                        unlock_buffer(bh);
+                        return ret;
+                }
+                goto retry;
+        }
        if (unlikely(!buffer_uptodate(bh)))
                ret = -EIO;
@@ -332,13 +355,15 @@ void jbd2_journal_commit_transaction(journal_t *journal)
        int flags;
        int err;
        unsigned long long blocknr;
+        ktime_t start_time;
+        u64 commit_time;
        char *tagp = NULL;
        journal_header_t *header;
        journal_block_tag_t *tag = NULL;
        int space_left = 0;
        int first_tag = 0;
        int tag_flag;
-        int i;
+        int i, to_free = 0;
        int tag_bytes = journal_tag_bytes(journal);
        struct buffer_head *cbh = NULL; /* For transactional checksums */
        __u32 crc32_sum = ~0;
@@ -458,6 +483,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
        commit_transaction->t_state = T_FLUSH;
        journal->j_committing_transaction = commit_transaction;
        journal->j_running_transaction = NULL;
+        start_time = ktime_get();
        commit_transaction->t_log_start = journal->j_head;
        wake_up(&journal->j_wait_transaction_locked);
        spin_unlock(&journal->j_state_lock);
@@ -509,6 +535,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
                if (is_journal_aborted(journal)) {
                        clear_buffer_jbddirty(jh2bh(jh));
                        JBUFFER_TRACE(jh, "journal is aborting: refile");
+                        jbd2_buffer_abort_trigger(jh,
+                                                  jh->b_frozen_data ?
+                                                  jh->b_frozen_triggers :
+                                                  jh->b_triggers);
                        jbd2_journal_refile_buffer(journal, jh);
                        /* If that was the last one, we need to clean up
                         * any descriptor buffers which may have been
@@ -799,7 +829,7 @@ wait_for_iobuf:
                        __jbd2_journal_abort_hard(journal);
        }
        if (!err && !is_journal_aborted(journal))
-                err = journal_wait_on_commit_record(cbh);
+                err = journal_wait_on_commit_record(journal, cbh);
        if (err)
                jbd2_journal_abort(journal, err);
@@ -844,6 +874,9 @@ restart_loop:
                 * data.
                 *
                 * Otherwise, we can just throw away the frozen data now.
+                 *
+                 * We also know that the frozen data has already fired
+                 * its triggers if they exist, so we can clear that too.
                 */
                if (jh->b_committed_data) {
                        jbd2_free(jh->b_committed_data, bh->b_size);
@@ -851,10 +884,12 @@ restart_loop:
                        if (jh->b_frozen_data) {
                                jh->b_committed_data = jh->b_frozen_data;
                                jh->b_frozen_data = NULL;
+                                jh->b_frozen_triggers = NULL;
                        }
                } else if (jh->b_frozen_data) {
                        jbd2_free(jh->b_frozen_data, bh->b_size);
                        jh->b_frozen_data = NULL;
+                        jh->b_frozen_triggers = NULL;
                }
                spin_lock(&journal->j_list_lock);
@@ -972,14 +1007,23 @@ restart_loop:
        J_ASSERT(commit_transaction == journal->j_committing_transaction);
        journal->j_commit_sequence = commit_transaction->t_tid;
        journal->j_committing_transaction = NULL;
-        spin_unlock(&journal->j_state_lock);
+        commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
-        if (journal->j_commit_callback)
+        /*
-                journal->j_commit_callback(journal, commit_transaction);
+         * weight the running average higher than the latest commit time so
+         * we don't react too strongly to vast changes in the commit time
+         */
+        if (likely(journal->j_average_commit_time))
+                journal->j_average_commit_time = (commit_time +
+                                journal->j_average_commit_time*3) / 4;
+        else
+                journal->j_average_commit_time = commit_time;
+        spin_unlock(&journal->j_state_lock);
        if (commit_transaction->t_checkpoint_list == NULL &&
            commit_transaction->t_checkpoint_io_list == NULL) {
                __jbd2_journal_drop_transaction(journal, commit_transaction);
+                to_free = 1;
        } else {
                if (journal->j_checkpoint_transactions == NULL) {
                        journal->j_checkpoint_transactions = commit_transaction;
@@ -998,11 +1042,16 @@ restart_loop:
        }
        spin_unlock(&journal->j_list_lock);
+        if (journal->j_commit_callback)
+                journal->j_commit_callback(journal, commit_transaction);
        trace_mark(jbd2_end_commit, "dev %s transaction %d head %d",
-                   journal->j_devname, journal->j_commit_sequence,
+                   journal->j_devname, commit_transaction->t_tid,
                   journal->j_tail_sequence);
        jbd_debug(1, "JBD: commit %d complete, head %d\n",
                  journal->j_commit_sequence, journal->j_tail_sequence);
+        if (to_free)
+                kfree(commit_transaction);
        wake_up(&journal->j_wait_done_commit);
 }
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index e70d657a19f8..eb343008eded 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -37,6 +37,7 @@
 #include <linux/proc_fs.h>
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
+#include <linux/math64.h>
 #include <asm/uaccess.h>
 #include <asm/page.h>
@@ -50,6 +51,7 @@ EXPORT_SYMBOL(jbd2_journal_unlock_updates);
 EXPORT_SYMBOL(jbd2_journal_get_write_access);
 EXPORT_SYMBOL(jbd2_journal_get_create_access);
 EXPORT_SYMBOL(jbd2_journal_get_undo_access);
+EXPORT_SYMBOL(jbd2_journal_set_triggers);
 EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
 EXPORT_SYMBOL(jbd2_journal_release_buffer);
 EXPORT_SYMBOL(jbd2_journal_forget);
@@ -65,7 +67,6 @@ EXPORT_SYMBOL(jbd2_journal_update_format);
 EXPORT_SYMBOL(jbd2_journal_check_used_features);
 EXPORT_SYMBOL(jbd2_journal_check_available_features);
 EXPORT_SYMBOL(jbd2_journal_set_features);
-EXPORT_SYMBOL(jbd2_journal_create);
 EXPORT_SYMBOL(jbd2_journal_load);
 EXPORT_SYMBOL(jbd2_journal_destroy);
 EXPORT_SYMBOL(jbd2_journal_abort);
@@ -131,8 +132,9 @@ static int kjournald2(void *arg)
        journal->j_task = current;
        wake_up(&journal->j_wait_done_commit);
-        printk(KERN_INFO "kjournald2 starting.  Commit interval %ld seconds\n",
+        printk(KERN_INFO "kjournald2 starting: pid %d, dev %s, "
-                        journal->j_commit_interval / HZ);
+               "commit interval %ld seconds\n", current->pid,
+               journal->j_devname, journal->j_commit_interval / HZ);
        /*
         * And now, wait forever for commit wakeup events.
@@ -290,6 +292,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
        struct page *new_page;
        unsigned int new_offset;
        struct buffer_head *bh_in = jh2bh(jh_in);
+        struct jbd2_buffer_trigger_type *triggers;
        /*
         * The buffer really shouldn't be locked: only the current committing
@@ -314,13 +317,23 @@ repeat:
                done_copy_out = 1;
                new_page = virt_to_page(jh_in->b_frozen_data);
                new_offset = offset_in_page(jh_in->b_frozen_data);
+                triggers = jh_in->b_frozen_triggers;
        } else {
                new_page = jh2bh(jh_in)->b_page;
                new_offset = offset_in_page(jh2bh(jh_in)->b_data);
+                triggers = jh_in->b_triggers;
        }
        mapped_data = kmap_atomic(new_page, KM_USER0);
        /*
+         * Fire any commit trigger.  Do this before checking for escaping,
+         * as the trigger may modify the magic offset.  If a copy-out
+         * happens afterwards, it will have the correct data in the buffer.
+         */
+        jbd2_buffer_commit_trigger(jh_in, mapped_data + new_offset,
+                                   triggers);
+        /*
         * Check for escaping
         */
        if (*((__be32 *)(mapped_data + new_offset)) ==
@@ -352,6 +365,13 @@ repeat:
                new_page = virt_to_page(tmp);
                new_offset = offset_in_page(tmp);
                done_copy_out = 1;
+                /*
+                 * This isn't strictly necessary, as we're using frozen
+                 * data for the escaping, but it keeps consistency with
+                 * b_frozen_data usage.
+                 */
+                jh_in->b_frozen_triggers = jh_in->b_triggers;
        }
        /*
@@ -631,6 +651,8 @@ struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
                return NULL;
        bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
+        if (!bh)
+                return NULL;
        lock_buffer(bh);
        memset(bh->b_data, 0, journal->j_blocksize);
        set_buffer_uptodate(bh);
@@ -824,6 +846,8 @@ static int jbd2_seq_info_show(struct seq_file *seq, void *v)
            jiffies_to_msecs(s->stats->u.run.rs_flushing / s->stats->ts_tid));
        seq_printf(seq, "  %ums logging transaction\n",
            jiffies_to_msecs(s->stats->u.run.rs_logging / s->stats->ts_tid));
+        seq_printf(seq, "  %lluus average transaction commit time\n",
+                   div_u64(s->journal->j_average_commit_time, 1000));
        seq_printf(seq, "  %lu handles per transaction\n",
            s->stats->u.run.rs_handle_count / s->stats->ts_tid);
        seq_printf(seq, "  %lu blocks per transaction\n",
@@ -961,6 +985,8 @@ static journal_t * journal_init_common (void)
        spin_lock_init(&journal->j_state_lock);
        journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
+        journal->j_min_batch_time = 0;
+        journal->j_max_batch_time = 15000; /* 15ms */
        /* The journal is marked for error until we succeed with recovery! */
        journal->j_flags = JBD2_ABORT;
@@ -1016,15 +1042,14 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
        /* journal descriptor can store up to n blocks -bzzz */
        journal->j_blocksize = blocksize;
+        jbd2_stats_proc_init(journal);
        n = journal->j_blocksize / sizeof(journal_block_tag_t);
        journal->j_wbufsize = n;
        journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
        if (!journal->j_wbuf) {
                printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n",
                        __func__);
-                kfree(journal);
+                goto out_err;
-                journal = NULL;
-                goto out;
        }
        journal->j_dev = bdev;
        journal->j_fs_dev = fs_dev;
@@ -1034,14 +1059,22 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
        p = journal->j_devname;
        while ((p = strchr(p, '/')))
                *p = '!';
-        jbd2_stats_proc_init(journal);
        bh = __getblk(journal->j_dev, start, journal->j_blocksize);
-        J_ASSERT(bh != NULL);
+        if (!bh) {
+                printk(KERN_ERR
+                       "%s: Cannot get buffer for journal superblock\n",
+                       __func__);
+                goto out_err;
+        }
        journal->j_sb_buffer = bh;
        journal->j_superblock = (journal_superblock_t *)bh->b_data;
-out:
        return journal;
+out_err:
+        jbd2_stats_proc_exit(journal);
+        kfree(journal);
+        return NULL;
 }
 /**
@@ -1089,9 +1122,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
        if (!journal->j_wbuf) {
                printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n",
                        __func__);
-                jbd2_stats_proc_exit(journal);
+                goto out_err;
-                kfree(journal);
-                return NULL;
        }
        err = jbd2_journal_bmap(journal, 0, &blocknr);
@@ -1099,17 +1130,24 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
        if (err) {
                printk(KERN_ERR "%s: Cannnot locate journal superblock\n",
                       __func__);
-                jbd2_stats_proc_exit(journal);
+                goto out_err;
-                kfree(journal);
-                return NULL;
        }
        bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
-        J_ASSERT(bh != NULL);
+        if (!bh) {
+                printk(KERN_ERR
+                       "%s: Cannot get buffer for journal superblock\n",
+                       __func__);
+                goto out_err;
+        }
        journal->j_sb_buffer = bh;
        journal->j_superblock = (journal_superblock_t *)bh->b_data;
        return journal;
+out_err:
+        jbd2_stats_proc_exit(journal);
+        kfree(journal);
+        return NULL;
 }
 /*
@@ -1158,77 +1196,6 @@ static int journal_reset(journal_t *journal)
 }
 /**
- * int jbd2_journal_create() - Initialise the new journal file
- * @journal: Journal to create. This structure must have been initialised
- *
- * Given a journal_t structure which tells us which disk blocks we can
- * use, create a new journal superblock and initialise all of the
- * journal fields from scratch.
- **/
-int jbd2_journal_create(journal_t *journal)
-{
-        unsigned long long blocknr;
-        struct buffer_head *bh;
-        journal_superblock_t *sb;
-        int i, err;
-        if (journal->j_maxlen < JBD2_MIN_JOURNAL_BLOCKS) {
-                printk (KERN_ERR "Journal length (%d blocks) too short.\n",
-                        journal->j_maxlen);
-                journal_fail_superblock(journal);
-                return -EINVAL;
-        }
-        if (journal->j_inode == NULL) {
-                /*
-                 * We don't know what block to start at!
-                 */
-                printk(KERN_EMERG
-                       "%s: creation of journal on external device!\n",
-                       __func__);
-                BUG();
-        }
-        /* Zero out the entire journal on disk.  We cannot afford to
-           have any blocks on disk beginning with JBD2_MAGIC_NUMBER. */
-        jbd_debug(1, "JBD: Zeroing out journal blocks...\n");
-        for (i = 0; i < journal->j_maxlen; i++) {
-                err = jbd2_journal_bmap(journal, i, &blocknr);
-                if (err)
-                        return err;
-                bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
-                lock_buffer(bh);
-                memset (bh->b_data, 0, journal->j_blocksize);
-                BUFFER_TRACE(bh, "marking dirty");
-                mark_buffer_dirty(bh);
-                BUFFER_TRACE(bh, "marking uptodate");
-                set_buffer_uptodate(bh);
-                unlock_buffer(bh);
-                __brelse(bh);
-        }
-        sync_blockdev(journal->j_dev);
-        jbd_debug(1, "JBD: journal cleared.\n");
-        /* OK, fill in the initial static fields in the new superblock */
-        sb = journal->j_superblock;
-        sb->s_header.h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
-        sb->s_header.h_blocktype = cpu_to_be32(JBD2_SUPERBLOCK_V2);
-        sb->s_blocksize = cpu_to_be32(journal->j_blocksize);
-        sb->s_maxlen    = cpu_to_be32(journal->j_maxlen);
-        sb->s_first     = cpu_to_be32(1);
-        journal->j_transaction_sequence = 1;
-        journal->j_flags &= ~JBD2_ABORT;
-        journal->j_format_version = 2;
-        return journal_reset(journal);
-}
-/**
 * void jbd2_journal_update_superblock() - Update journal sb on disk.
 * @journal: The journal to update.
 * @wait: Set to '0' if you don't want to wait for IO completion.
@@ -1472,7 +1439,9 @@ int jbd2_journal_destroy(journal_t *journal)
        spin_lock(&journal->j_list_lock);
        while (journal->j_checkpoint_transactions != NULL) {
                spin_unlock(&journal->j_list_lock);
+                mutex_lock(&journal->j_checkpoint_mutex);
                jbd2_log_do_checkpoint(journal);
+                mutex_unlock(&journal->j_checkpoint_mutex);
                spin_lock(&journal->j_list_lock);
        }
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 39b7805a599a..46b4e347ed7d 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -25,6 +25,7 @@
 #include <linux/timer.h>
 #include <linux/mm.h>
 #include <linux/highmem.h>
+#include <linux/hrtimer.h>
 static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
@@ -48,6 +49,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
 {
        transaction->t_journal = journal;
        transaction->t_state = T_RUNNING;
+        transaction->t_start_time = ktime_get();
        transaction->t_tid = journal->j_transaction_sequence++;
        transaction->t_expires = jiffies + journal->j_commit_interval;
        spin_lock_init(&transaction->t_handle_lock);
@@ -741,6 +743,12 @@ done:
                source = kmap_atomic(page, KM_USER0);
                memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
                kunmap_atomic(source, KM_USER0);
+                /*
+                 * Now that the frozen data is saved off, we need to store
+                 * any matching triggers.
+                 */
+                jh->b_frozen_triggers = jh->b_triggers;
        }
        jbd_unlock_bh_state(bh);
@@ -944,6 +952,47 @@ out:
 }
 /**
+ * void jbd2_journal_set_triggers() - Add triggers for commit writeout
+ * @bh: buffer to trigger on
+ * @type: struct jbd2_buffer_trigger_type containing the trigger(s).
+ *
+ * Set any triggers on this journal_head.  This is always safe, because
+ * triggers for a committing buffer will be saved off, and triggers for
+ * a running transaction will match the buffer in that transaction.
+ *
+ * Call with NULL to clear the triggers.
+ *
+ * Only the @type pointer is stored in the journal_head's b_triggers
+ * field; the trigger structure itself is not copied.
+ *
+ * NOTE(review): bh2jh() is used here without any check, so @bh
+ * presumably must already have a journal_head attached -- confirm
+ * against the callers.
+ */
+void jbd2_journal_set_triggers(struct buffer_head *bh,
+                               struct jbd2_buffer_trigger_type *type)
+{
+        struct journal_head *jh = bh2jh(bh);
+        jh->b_triggers = type;
+}
+/*
+ * Invoke the t_commit hook of @triggers for the buffer behind @jh.
+ *
+ * The hook is called with @triggers itself, the buffer_head obtained
+ * from @jh, @mapped_data, and the buffer size (bh->b_size).  If
+ * @triggers is NULL or has no t_commit hook, this does nothing.
+ */
+void jbd2_buffer_commit_trigger(struct journal_head *jh, void *mapped_data,
+                                struct jbd2_buffer_trigger_type *triggers)
+{
+        struct buffer_head *bh = jh2bh(jh);
+        /* No trigger set, or no commit hook in it: nothing to fire. */
+        if (!triggers || !triggers->t_commit)
+                return;
+        triggers->t_commit(triggers, bh, mapped_data, bh->b_size);
+}
+/*
+ * Invoke the t_abort hook of @triggers for the buffer behind @jh.
+ *
+ * The hook is called with @triggers itself and the buffer_head obtained
+ * from @jh.  If @triggers is NULL or has no t_abort hook, this does
+ * nothing.
+ */
+void jbd2_buffer_abort_trigger(struct journal_head *jh,
+                               struct jbd2_buffer_trigger_type *triggers)
+{
+        /* No trigger set, or no abort hook in it: nothing to fire. */
+        if (!triggers || !triggers->t_abort)
+                return;
+        triggers->t_abort(triggers, jh2bh(jh));
+}
+/**
 * int jbd2_journal_dirty_metadata() -  mark a buffer as containing dirty metadata
 * @handle: transaction to add buffer to.
 * @bh: buffer to mark
@@ -1193,7 +1242,7 @@ int jbd2_journal_stop(handle_t *handle)
 {
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal = transaction->t_journal;
-        int old_handle_count, err;
+        int err;
        pid_t pid;
        J_ASSERT(journal_current_handle() == handle);
@@ -1216,24 +1265,54 @@ int jbd2_journal_stop(handle_t *handle)
        /*
         * Implement synchronous transaction batching.  If the handle
         * was synchronous, don't force a commit immediately.  Let's
-         * yield and let another thread piggyback onto this transaction.
+         * yield and let another thread piggyback onto this
-         * Keep doing that while new threads continue to arrive.
+         * transaction.  Keep doing that while new threads continue to
-         * It doesn't cost much - we're about to run a commit and sleep
+         * arrive.  It doesn't cost much - we're about to run a commit
-         * on IO anyway.  Speeds up many-threaded, many-dir operations
+         * and sleep on IO anyway.  Speeds up many-threaded, many-dir
-         * by 30x or more...
+         * operations by 30x or more...
         *
-         * But don't do this if this process was the most recent one to
+         * We try and optimize the sleep time against what the
-         * perform a synchronous write.  We do this to detect the case where a
+         * underlying disk can do, instead of having a static sleep
-         * single process is doing a stream of sync writes.  No point in waiting
+         * time.  This is useful for the case where our storage is so
-         * for joiners in that case.
+         * fast that it is more optimal to go ahead and force a flush
+         * and wait for the transaction to be committed than it is to
+         * wait for an arbitrary amount of time for new writers to
+         * join the transaction.  We achieve this by measuring how
+         * long it takes to commit a transaction, and comparing it with
+         * how long this transaction has been running; if run time
+         * < commit time then we sleep for the average commit time
+         * (clamped to the min/max batch times) and commit.  This
+         * greatly helps super fast disks that would see slowdowns as
+         * more threads started doing fsyncs.
+         *
+         * But don't do this if this process was the most recent one
+         * to perform a synchronous write.  We do this to detect the
+         * case where a single process is doing a stream of sync
+         * writes.  No point in waiting for joiners in that case.
         */
        pid = current->pid;
        if (handle->h_sync && journal->j_last_sync_writer != pid) {
+                u64 commit_time, trans_time;
                journal->j_last_sync_writer = pid;
-                do {
-                        old_handle_count = transaction->t_handle_count;
+                spin_lock(&journal->j_state_lock);
-                        schedule_timeout_uninterruptible(1);
+                commit_time = journal->j_average_commit_time;
-                } while (old_handle_count != transaction->t_handle_count);
+                spin_unlock(&journal->j_state_lock);
+                trans_time = ktime_to_ns(ktime_sub(ktime_get(),
+                                                   transaction->t_start_time));
+                commit_time = max_t(u64, commit_time,
+                                    1000*journal->j_min_batch_time);
+                commit_time = min_t(u64, commit_time,
+                                    1000*journal->j_max_batch_time);
+                if (trans_time < commit_time) {
+                        ktime_t expires = ktime_add_ns(ktime_get(),
+                                                       commit_time);
+                        set_current_state(TASK_UNINTERRUPTIBLE);
+                        schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
+                }
        }
        current->journal_info = NULL;
diff --git a/fs/jffs2/compr_rubin.c b/fs/jffs2/compr_rubin.c
index c73fa89b5f8a..170d289ac785 100644
--- a/fs/jffs2/compr_rubin.c
+++ b/fs/jffs2/compr_rubin.c
@@ -22,9 +22,7 @@
 #define BIT_DIVIDER_MIPS 1043
-static int bits_mips[8] = { 277,249,290,267,229,341,212,241}; /* mips32 */
+static int bits_mips[8] = { 277, 249, 290, 267, 229, 341, 212, 241};
-#include <linux/errno.h>
 struct pushpull {
        unsigned char *buf;
@@ -43,7 +41,9 @@ struct rubin_state {
        int bits[8];
 };
-static inline void init_pushpull(struct pushpull *pp, char *buf, unsigned buflen, unsigned ofs, unsigned reserve)
+static inline void init_pushpull(struct pushpull *pp, char *buf,
+                                 unsigned buflen, unsigned ofs,
+                                 unsigned reserve)
 {
        pp->buf = buf;
        pp->buflen = buflen;
@@ -53,16 +53,14 @@ static inline void init_pushpull(struct pushpull *pp, char *buf, unsigned buflen
 static inline int pushbit(struct pushpull *pp, int bit, int use_reserved)
 {
-        if (pp->ofs >= pp->buflen - (use_reserved?0:pp->reserve)) {
+        if (pp->ofs >= pp->buflen - (use_reserved?0:pp->reserve))
                return -ENOSPC;
-        }
-        if (bit) {
+        if (bit)
-                pp->buf[pp->ofs >> 3] |= (1<<(7-(pp->ofs &7)));
+                pp->buf[pp->ofs >> 3] |= (1<<(7-(pp->ofs & 7)));
-        }
+        else
-        else {
+                pp->buf[pp->ofs >> 3] &= ~(1<<(7-(pp->ofs & 7)));
-                pp->buf[pp->ofs >> 3] &= ~(1<<(7-(pp->ofs &7)));
-        }
        pp->ofs++;
        return 0;
@@ -97,6 +95,7 @@ static void init_rubin(struct rubin_state *rs, int div, int *bits)
        rs->p = (long) (2 * UPPER_BIT_RUBIN);
        rs->bit_number = (long) 0;
        rs->bit_divider = div;
        for (c=0; c<8; c++)
                rs->bits[c] = bits[c];
 }
@@ -108,7 +107,8 @@ static int encode(struct rubin_state *rs, long A, long B, int symbol)
        long i0, i1;
        int ret;
-        while ((rs->q >= UPPER_BIT_RUBIN) || ((rs->p + rs->q) <= UPPER_BIT_RUBIN)) {
+        while ((rs->q >= UPPER_BIT_RUBIN) ||
+               ((rs->p + rs->q) <= UPPER_BIT_RUBIN)) {
                rs->bit_number++;
                ret = pushbit(&rs->pp, (rs->q & UPPER_BIT_RUBIN) ? 1 : 0, 0);
@@ -119,12 +119,12 @@ static int encode(struct rubin_state *rs, long A, long B, int symbol)
                rs->p <<= 1;
        }
        i0 = A * rs->p / (A + B);
-        if (i0 <= 0) {
+        if (i0 <= 0)
                i0 = 1;
-        }
-        if (i0 >= rs->p) {
+        if (i0 >= rs->p)
                i0 = rs->p - 1;
-        }
        i1 = rs->p - i0;
        if (symbol == 0)
@@ -157,11 +157,13 @@ static void init_decode(struct rubin_state *rs, int div, int *bits)
        /* behalve lower */
        rs->rec_q = 0;
-        for (rs->bit_number = 0; rs->bit_number++ < RUBIN_REG_SIZE; rs->rec_q = rs->rec_q * 2 + (long) (pullbit(&rs->pp)))
+        for (rs->bit_number = 0; rs->bit_number++ < RUBIN_REG_SIZE;
+             rs->rec_q = rs->rec_q * 2 + (long) (pullbit(&rs->pp)))
                ;
 }
-static void __do_decode(struct rubin_state *rs, unsigned long p, unsigned long q)
+static void __do_decode(struct rubin_state *rs, unsigned long p,
+                        unsigned long q)
 {
        register unsigned long lower_bits_rubin = LOWER_BITS_RUBIN;
        unsigned long rec_q;
@@ -207,12 +209,11 @@ static int decode(struct rubin_state *rs, long A, long B)
                __do_decode(rs, p, q);
        i0 = A * rs->p / (A + B);
-        if (i0 <= 0) {
+        if (i0 <= 0)
                i0 = 1;
-        }
-        if (i0 >= rs->p) {
+        if (i0 >= rs->p)
                i0 = rs->p - 1;
-        }
        threshold = rs->q + i0;
        symbol = rs->rec_q >= threshold;
@@ -234,14 +235,15 @@ static int out_byte(struct rubin_state *rs, unsigned char byte)
        struct rubin_state rs_copy;
        rs_copy = *rs;
-        for (i=0;i<8;i++) {
+        for (i=0; i<8; i++) {
-                ret = encode(rs, rs->bit_divider-rs->bits[i],rs->bits[i],byte&1);
+                ret = encode(rs, rs->bit_divider-rs->bits[i],
+                             rs->bits[i], byte & 1);
                if (ret) {
                        /* Failed. Restore old state */
                        *rs = rs_copy;
                        return ret;
                }
-                byte=byte>>1;
+                byte >>= 1 ;
        }
        return 0;
 }
@@ -251,7 +253,8 @@ static int in_byte(struct rubin_state *rs)
        int i, result = 0, bit_divider = rs->bit_divider;
        for (i = 0; i < 8; i++)
-                result |= decode(rs, bit_divider - rs->bits[i], rs->bits[i]) << i;
+                result |= decode(rs, bit_divider - rs->bits[i],
+                                 rs->bits[i]) << i;
        return result;
 }
@@ -259,7 +262,8 @@ static int in_byte(struct rubin_state *rs)
 static int rubin_do_compress(int bit_divider, int *bits, unsigned char *data_in,
-                      unsigned char *cpage_out, uint32_t *sourcelen, uint32_t *dstlen)
+                             unsigned char *cpage_out, uint32_t *sourcelen,
+                             uint32_t *dstlen)
        {
        int outpos = 0;
        int pos=0;
@@ -295,7 +299,8 @@ static int rubin_do_compress(int bit_divider, int *bits, unsigned char *data_in,
 int jffs2_rubinmips_compress(unsigned char *data_in, unsigned char *cpage_out,
                   uint32_t *sourcelen, uint32_t *dstlen, void *model)
 {
-        return rubin_do_compress(BIT_DIVIDER_MIPS, bits_mips, data_in, cpage_out, sourcelen, dstlen);
+        return rubin_do_compress(BIT_DIVIDER_MIPS, bits_mips, data_in,
+                                 cpage_out, sourcelen, dstlen);
 }
 #endif
 static int jffs2_dynrubin_compress(unsigned char *data_in,
@@ -316,9 +321,8 @@ static int jffs2_dynrubin_compress(unsigned char *data_in,
                return -1;
        memset(histo, 0, 256);
-        for (i=0; i<mysrclen; i++) {
+        for (i=0; i<mysrclen; i++)
                histo[data_in[i]]++;
-        }
        memset(bits, 0, sizeof(int)*8);
        for (i=0; i<256; i++) {
                if (i&128)
@@ -346,7 +350,8 @@ static int jffs2_dynrubin_compress(unsigned char *data_in,
                cpage_out[i] = bits[i];
        }
-        ret = rubin_do_compress(256, bits, data_in, cpage_out+8, &mysrclen, &mydstlen);
+        ret = rubin_do_compress(256, bits, data_in, cpage_out+8, &mysrclen,
+                                &mydstlen);
        if (ret)
                return ret;
@@ -363,8 +368,10 @@ static int jffs2_dynrubin_compress(unsigned char *data_in,
        return 0;
 }
-static void rubin_do_decompress(int bit_divider, int *bits, unsigned char *cdata_in,
+static void rubin_do_decompress(int bit_divider, int *bits,
-                         unsigned char *page_out, uint32_t srclen, uint32_t destlen)
+                                unsigned char *cdata_in, 
+                                unsigned char *page_out, uint32_t srclen,
+                                uint32_t destlen)
 {
        int outpos = 0;
        struct rubin_state rs;
@@ -372,9 +379,8 @@ static void rubin_do_decompress(int bit_divider, int *bits, unsigned char *cdata
        init_pushpull(&rs.pp, cdata_in, srclen, 0, 0);
        init_decode(&rs, bit_divider, bits);
-        while (outpos < destlen) {
+        while (outpos < destlen)
                page_out[outpos++] = in_byte(&rs);
-        }
 }
@@ -383,7 +389,8 @@ static int jffs2_rubinmips_decompress(unsigned char *data_in,
                                      uint32_t sourcelen, uint32_t dstlen,
                                      void *model)
 {
-        rubin_do_decompress(BIT_DIVIDER_MIPS, bits_mips, data_in, cpage_out, sourcelen, dstlen);
+        rubin_do_decompress(BIT_DIVIDER_MIPS, bits_mips, data_in,
+                            cpage_out, sourcelen, dstlen);
        return 0;
 }
@@ -398,52 +405,53 @@ static int jffs2_dynrubin_decompress(unsigned char *data_in,
        for (c=0; c<8; c++)
                bits[c] = data_in[c];
-        rubin_do_decompress(256, bits, data_in+8, cpage_out, sourcelen-8, dstlen);
+        rubin_do_decompress(256, bits, data_in+8, cpage_out, sourcelen-8,
+                            dstlen);
        return 0;
 }
 static struct jffs2_compressor jffs2_rubinmips_comp = {
-    .priority = JFFS2_RUBINMIPS_PRIORITY,
+        .priority = JFFS2_RUBINMIPS_PRIORITY,
-    .name = "rubinmips",
+        .name = "rubinmips",
-    .compr = JFFS2_COMPR_DYNRUBIN,
+        .compr = JFFS2_COMPR_DYNRUBIN,
-    .compress = NULL, /*&jffs2_rubinmips_compress,*/
+        .compress = NULL, /*&jffs2_rubinmips_compress,*/
-    .decompress = &jffs2_rubinmips_decompress,
+        .decompress = &jffs2_rubinmips_decompress,
 #ifdef JFFS2_RUBINMIPS_DISABLED
-    .disabled = 1,
+        .disabled = 1,
 #else
-    .disabled = 0,
+        .disabled = 0,
 #endif
 };
 int jffs2_rubinmips_init(void)
 {
-    return jffs2_register_compressor(&jffs2_rubinmips_comp);
+        return jffs2_register_compressor(&jffs2_rubinmips_comp);
 }
 void jffs2_rubinmips_exit(void)
 {
-    jffs2_unregister_compressor(&jffs2_rubinmips_comp);
+        jffs2_unregister_compressor(&jffs2_rubinmips_comp);
 }
 static struct jffs2_compressor jffs2_dynrubin_comp = {
-    .priority = JFFS2_DYNRUBIN_PRIORITY,
+        .priority = JFFS2_DYNRUBIN_PRIORITY,
-    .name = "dynrubin",
+        .name = "dynrubin",
-    .compr = JFFS2_COMPR_RUBINMIPS,
+        .compr = JFFS2_COMPR_RUBINMIPS,
-    .compress = jffs2_dynrubin_compress,
+        .compress = jffs2_dynrubin_compress,
-    .decompress = &jffs2_dynrubin_decompress,
+        .decompress = &jffs2_dynrubin_decompress,
 #ifdef JFFS2_DYNRUBIN_DISABLED
-    .disabled = 1,
+        .disabled = 1,
 #else
-    .disabled = 0,
+        .disabled = 0,
 #endif
 };
 int jffs2_dynrubin_init(void)
 {
-    return jffs2_register_compressor(&jffs2_dynrubin_comp);
+        return jffs2_register_compressor(&jffs2_dynrubin_comp);
 }
 void jffs2_dynrubin_exit(void)
 {
-    jffs2_unregister_compressor(&jffs2_dynrubin_comp);
+        jffs2_unregister_compressor(&jffs2_dynrubin_comp);
 }
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index 259461b910af..c32b4a1ad6cf 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -175,7 +175,7 @@ static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock
 {
        /* For NAND, if the failure did not occur at the device level for a
           specific physical page, don't bother updating the bad block table. */
-        if (jffs2_cleanmarker_oob(c) && (bad_offset != MTD_FAIL_ADDR_UNKNOWN)) {
+        if (jffs2_cleanmarker_oob(c) && (bad_offset != (uint32_t)MTD_FAIL_ADDR_UNKNOWN)) {
                /* We had a device-level failure to erase.  Let's see if we've
                   failed too many times. */
                if (!jffs2_write_nand_badblock(c, jeb, bad_offset)) {
@@ -209,7 +209,8 @@ static void jffs2_erase_callback(struct erase_info *instr)
        struct erase_priv_struct *priv = (void *)instr->priv;
        if(instr->state != MTD_ERASE_DONE) {
-                printk(KERN_WARNING "Erase at 0x%08x finished, but state != MTD_ERASE_DONE. State is 0x%x instead.\n", instr->addr, instr->state);
+                printk(KERN_WARNING "Erase at 0x%08llx finished, but state != MTD_ERASE_DONE. State is 0x%x instead.\n",
+                        (unsigned long long)instr->addr, instr->state);
                jffs2_erase_failed(priv->c, priv->jeb, instr->fail_addr);
        } else {
                jffs2_erase_succeeded(priv->c, priv->jeb);
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 5a98aa87c853..5edc2bf20581 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -132,7 +132,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
        uint32_t pageofs = index << PAGE_CACHE_SHIFT;
        int ret = 0;
-        pg = __grab_cache_page(mapping, index);
+        pg = grab_cache_page_write_begin(mapping, index, flags);
        if (!pg)
                return -ENOMEM;
        *pagep = pg;
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index 1750445556c3..507ed6ec1847 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -366,9 +366,6 @@ void jffs2_free_ino_caches(struct jffs2_sb_info *c);
 void jffs2_free_raw_node_refs(struct jffs2_sb_info *c);
 struct jffs2_node_frag *jffs2_lookup_node_frag(struct rb_root *fragtree, uint32_t offset);
 void jffs2_kill_fragtree(struct rb_root *root, struct jffs2_sb_info *c_delete);
-struct rb_node *rb_next(struct rb_node *);
-struct rb_node *rb_prev(struct rb_node *);
-void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root);
 int jffs2_add_full_dnode_to_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_full_dnode *fn);
 uint32_t jffs2_truncate_fragtree (struct jffs2_sb_info *c, struct rb_root *list, uint32_t size);
 struct jffs2_raw_node_ref *jffs2_link_node_ref(struct jffs2_sb_info *c,
diff --git a/fs/jfs/Kconfig b/fs/jfs/Kconfig
new file mode 100644
index 000000000000..9ff619a6f9cc
--- /dev/null
+++ b/fs/jfs/Kconfig
@@ -0,0 +1,49 @@
+config JFS_FS
+        tristate "JFS filesystem support"
+        select NLS
+        help
+          This is a port of IBM's Journaled Filesystem.  More information is
+          available in the file <file:Documentation/filesystems/jfs.txt>.
+          If you do not intend to use the JFS filesystem, say N.
+config JFS_POSIX_ACL
+        bool "JFS POSIX Access Control Lists"
+        depends on JFS_FS
+        select FS_POSIX_ACL
+        help
+          Posix Access Control Lists (ACLs) support permissions for users and
+          groups beyond the owner/group/world scheme.
+          To learn more about Access Control Lists, visit the Posix ACLs for
+          Linux website <http://acl.bestbits.at/>.
+          If you don't know what Access Control Lists are, say N.
+config JFS_SECURITY
+        bool "JFS Security Labels"
+        depends on JFS_FS
+        help
+          Security labels support alternative access control models
+          implemented by security modules like SELinux.  This option
+          enables an extended attribute handler for file security
+          labels in the jfs filesystem.
+          If you are not using a security module that requires using
+          extended attributes for file security labels, say N.
+config JFS_DEBUG
+        bool "JFS debugging"
+        depends on JFS_FS
+        help
+          If you are experiencing any problems with the JFS filesystem, say
+          Y here.  This will result in additional debugging messages being
+          written to the system log.  Under normal circumstances, this
+          results in very little overhead.
+config JFS_STATISTICS
+        bool "JFS statistics"
+        depends on JFS_FS
+        help
+          Enabling this option will cause statistics from the JFS file system
+          to be made available to the user in the /proc/fs/jfs/ directory.
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index d6363d8309d0..0f94381ca6d0 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -58,9 +58,9 @@
 /*
 * __mark_inode_dirty expects inodes to be hashed.  Since we don't want
- * special inodes in the fileset inode space, we hash them to a dummy head
+ * special inodes in the fileset inode space, we make them appear hashed,
+ * but do not put them on any lists.
 */
-static HLIST_HEAD(aggregate_hash);
 /*
 * imap locks
@@ -496,7 +496,11 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary)
        /* release the page */
        release_metapage(mp);
-        hlist_add_head(&ip->i_hash, &aggregate_hash);
+        /*
+         * Make the inode look hashed without putting it on any list;
+         * hlist_del() will still work fine and require no locking.
+         */
+        ip->i_hash.pprev = &ip->i_hash.next;
        return (ip);
 }
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index 70022fd1c539..d4d142c2edd4 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -79,7 +79,8 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
        inode = new_inode(sb);
        if (!inode) {
                jfs_warn("ialloc: new_inode returned NULL!");
-                return ERR_PTR(-ENOMEM);
+                rc = -ENOMEM;
+                goto fail;
        }
        jfs_inode = JFS_IP(inode);
@@ -89,8 +90,12 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
                jfs_warn("ialloc: diAlloc returned %d!", rc);
                if (rc == -EIO)
                        make_bad_inode(inode);
-                iput(inode);
+                goto fail_put;
-                return ERR_PTR(rc);
+        }
+        if (insert_inode_locked(inode) < 0) {
+                rc = -EINVAL;
+                goto fail_unlock;
        }
        inode->i_uid = current_fsuid();
@@ -112,11 +117,8 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
         * Allocate inode to quota.
         */
        if (DQUOT_ALLOC_INODE(inode)) {
-                DQUOT_DROP(inode);
+                rc = -EDQUOT;
-                inode->i_flags |= S_NOQUOTA;
+                goto fail_drop;
-                inode->i_nlink = 0;
-                iput(inode);
-                return ERR_PTR(-EDQUOT);
        }
        inode->i_mode = mode;
@@ -158,4 +160,15 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
        jfs_info("ialloc returns inode = 0x%p\n", inode);
        return inode;
+fail_drop:
+        DQUOT_DROP(inode);
+        inode->i_flags |= S_NOQUOTA;
+fail_unlock:
+        inode->i_nlink = 0;
+        unlock_new_inode(inode);
+fail_put:
+        iput(inode);
+fail:
+        return ERR_PTR(rc);
 }
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index cc3cedffbfa1..b4de56b851e4 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -155,7 +155,6 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode,
        ip->i_fop = &jfs_file_operations;
        ip->i_mapping->a_ops = &jfs_aops;
-        insert_inode_hash(ip);
        mark_inode_dirty(ip);
        dip->i_ctime = dip->i_mtime = CURRENT_TIME;
@@ -171,9 +170,12 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode,
        if (rc) {
                free_ea_wmap(ip);
                ip->i_nlink = 0;
+                unlock_new_inode(ip);
                iput(ip);
-        } else
+        } else {
                d_instantiate(dentry, ip);
+                unlock_new_inode(ip);
+        }
      out2:
        free_UCSname(&dname);
@@ -289,7 +291,6 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
        ip->i_op = &jfs_dir_inode_operations;
        ip->i_fop = &jfs_dir_operations;
-        insert_inode_hash(ip);
        mark_inode_dirty(ip);
        /* update parent directory inode */
@@ -306,9 +307,12 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
        if (rc) {
                free_ea_wmap(ip);
                ip->i_nlink = 0;
+                unlock_new_inode(ip);
                iput(ip);
-        } else
+        } else {
                d_instantiate(dentry, ip);
+                unlock_new_inode(ip);
+        }
      out2:
        free_UCSname(&dname);
@@ -1019,7 +1023,6 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
                goto out3;
        }
-        insert_inode_hash(ip);
        mark_inode_dirty(ip);
        dip->i_ctime = dip->i_mtime = CURRENT_TIME;
@@ -1039,9 +1042,12 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
        if (rc) {
                free_ea_wmap(ip);
                ip->i_nlink = 0;
+                unlock_new_inode(ip);
                iput(ip);
-        } else
+        } else {
                d_instantiate(dentry, ip);
+                unlock_new_inode(ip);
+        }
      out2:
        free_UCSname(&dname);
@@ -1399,7 +1405,6 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry,
        jfs_ip->dev = new_encode_dev(rdev);
        init_special_inode(ip, ip->i_mode, rdev);
-        insert_inode_hash(ip);
        mark_inode_dirty(ip);
        dir->i_ctime = dir->i_mtime = CURRENT_TIME;
@@ -1417,9 +1422,12 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry,
        if (rc) {
                free_ea_wmap(ip);
                ip->i_nlink = 0;
+                unlock_new_inode(ip);
                iput(ip);
-        } else
+        } else {
                d_instantiate(dentry, ip);
+                unlock_new_inode(ip);
+        }
      out1:
        free_UCSname(&dname);
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 0dae345e481b..b37d1f78b854 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -543,7 +543,7 @@ out_kfree:
        return ret;
 }
-static void jfs_write_super_lockfs(struct super_block *sb)
+static int jfs_freeze(struct super_block *sb)
 {
        struct jfs_sb_info *sbi = JFS_SBI(sb);
        struct jfs_log *log = sbi->log;
@@ -553,9 +553,10 @@ static void jfs_write_super_lockfs(struct super_block *sb)
                lmLogShutdown(log);
                updateSuper(sb, FM_CLEAN);
        }
+        return 0;
 }
-static void jfs_unlockfs(struct super_block *sb)
+static int jfs_unfreeze(struct super_block *sb)
 {
        struct jfs_sb_info *sbi = JFS_SBI(sb);
        struct jfs_log *log = sbi->log;
@@ -568,6 +569,7 @@ static void jfs_unlockfs(struct super_block *sb)
                else
                        txResume(sb);
        }
+        return 0;
 }
 static int jfs_get_sb(struct file_system_type *fs_type,
@@ -735,8 +737,8 @@ static const struct super_operations jfs_super_operations = {
        .delete_inode   = jfs_delete_inode,
        .put_super      = jfs_put_super,
        .sync_fs        = jfs_sync_fs,
-        .write_super_lockfs = jfs_write_super_lockfs,
+        .freeze_fs      = jfs_freeze,
-        .unlockfs       = jfs_unlockfs,
+        .unfreeze_fs    = jfs_unfreeze,
        .statfs         = jfs_statfs,
        .remount_fs     = jfs_remount,
        .show_options   = jfs_show_options,
diff --git a/fs/libfs.c b/fs/libfs.c
index e960a8321902..49b44099dabb 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -231,7 +231,6 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name,
         */
        root->i_ino = 1;
        root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;
-        root->i_uid = root->i_gid = 0;
        root->i_atime = root->i_mtime = root->i_ctime = CURRENT_TIME;
        dentry = d_alloc(NULL, &d_name);
        if (!dentry) {
@@ -360,7 +359,7 @@ int simple_write_begin(struct file *file, struct address_space *mapping,
        index = pos >> PAGE_CACHE_SHIFT;
        from = pos & (PAGE_CACHE_SIZE - 1);
-        page = __grab_cache_page(mapping, index);
+        page = grab_cache_page_write_begin(mapping, index, flags);
        if (!page)
                return -ENOMEM;
@@ -436,8 +435,6 @@ int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files
         */
        inode->i_ino = 1;
        inode->i_mode = S_IFDIR | 0755;
-        inode->i_uid = inode->i_gid = 0;
-        inode->i_blocks = 0;
        inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
        inode->i_op = &simple_dir_inode_operations;
        inode->i_fop = &simple_dir_operations;
@@ -464,8 +461,6 @@ int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files
                if (!inode)
                        goto out;
                inode->i_mode = S_IFREG | files->mode;
-                inode->i_uid = inode->i_gid = 0;
-                inode->i_blocks = 0;
                inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
                inode->i_fop = files->ops;
                inode->i_ino = i;
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 31668b690e03..dd7957064a8c 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -16,7 +16,6 @@
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/svc.h>
 #include <linux/lockd/lockd.h>
-#include <linux/lockd/sm_inter.h>
 #define NLMDBG_FACILITY         NLMDBG_CLIENT
 #define NLMCLNT_GRACE_WAIT      (5*HZ)
@@ -518,11 +517,9 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
        unsigned char fl_type;
        int status = -ENOLCK;
-        if (nsm_monitor(host) < 0) {
+        if (nsm_monitor(host) < 0)
-                printk(KERN_NOTICE "lockd: failed to monitor %s\n",
-                                        host->h_name);
                goto out;
-        }
        fl->fl_flags |= FL_ACCESS;
        status = do_vfs_lock(fl);
        fl->fl_flags = fl_flags;
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index abdebf76b820..99d737bd4325 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -15,7 +15,6 @@
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/svc.h>
 #include <linux/lockd/lockd.h>
-#include <linux/lockd/sm_inter.h>
 #include <linux/mutex.h>
 #include <net/ipv6.h>
@@ -32,11 +31,6 @@ static int			nrhosts;
 static DEFINE_MUTEX(nlm_host_mutex);
 static void                     nlm_gc_hosts(void);
-static struct nsm_handle        *nsm_find(const struct sockaddr *sap,
-                                                const size_t salen,
-                                                const char *hostname,
-                                                const size_t hostname_len,
-                                                const int create);
 struct nlm_lookup_host_info {
        const int               server;         /* search for server|client */
@@ -105,32 +99,6 @@ static void nlm_clear_port(struct sockaddr *sap)
        }
 }
-static void nlm_display_address(const struct sockaddr *sap,
-                                char *buf, const size_t len)
-{
-        const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
-        const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
-        switch (sap->sa_family) {
-        case AF_UNSPEC:
-                snprintf(buf, len, "unspecified");
-                break;
-        case AF_INET:
-                snprintf(buf, len, "%pI4", &sin->sin_addr.s_addr);
-                break;
-        case AF_INET6:
-                if (ipv6_addr_v4mapped(&sin6->sin6_addr))
-                        snprintf(buf, len, "%pI4",
-                                 &sin6->sin6_addr.s6_addr32[3]);
-                else
-                        snprintf(buf, len, "%pI6", &sin6->sin6_addr);
-                break;
-        default:
-                snprintf(buf, len, "unsupported address family");
-                break;
-        }
-}
 /*
 * Common host lookup routine for server & client
 */
@@ -190,8 +158,8 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
                atomic_inc(&nsm->sm_count);
        else {
                host = NULL;
-                nsm = nsm_find(ni->sap, ni->salen,
+                nsm = nsm_get_handle(ni->sap, ni->salen,
-                                ni->hostname, ni->hostname_len, 1);
+                                        ni->hostname, ni->hostname_len);
                if (!nsm) {
                        dprintk("lockd: nlm_lookup_host failed; "
                                "no nsm handle\n");
@@ -206,6 +174,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
                goto out;
        }
        host->h_name       = nsm->sm_name;
+        host->h_addrbuf    = nsm->sm_addrbuf;
        memcpy(nlm_addr(host), ni->sap, ni->salen);
        host->h_addrlen = ni->salen;
        nlm_clear_port(nlm_addr(host));
@@ -232,11 +201,6 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
        nrhosts++;
-        nlm_display_address((struct sockaddr *)&host->h_addr,
-                                host->h_addrbuf, sizeof(host->h_addrbuf));
-        nlm_display_address((struct sockaddr *)&host->h_srcaddr,
-                                host->h_srcaddrbuf, sizeof(host->h_srcaddrbuf));
        dprintk("lockd: nlm_lookup_host created host %s\n",
                        host->h_name);
@@ -256,10 +220,8 @@ nlm_destroy_host(struct nlm_host *host)
        BUG_ON(!list_empty(&host->h_lockowners));
        BUG_ON(atomic_read(&host->h_count));
-        /*
-         * Release NSM handle and unmonitor host.
-         */
        nsm_unmonitor(host);
+        nsm_release(host->h_nsmhandle);
        clnt = host->h_rpcclnt;
        if (clnt != NULL)
@@ -378,8 +340,8 @@ nlm_bind_host(struct nlm_host *host)
 {
        struct rpc_clnt *clnt;
-        dprintk("lockd: nlm_bind_host %s (%s), my addr=%s\n",
+        dprintk("lockd: nlm_bind_host %s (%s)\n",
-                        host->h_name, host->h_addrbuf, host->h_srcaddrbuf);
+                        host->h_name, host->h_addrbuf);
        /* Lock host handle */
        mutex_lock(&host->h_mutex);
@@ -481,35 +443,23 @@ void nlm_release_host(struct nlm_host *host)
        }
 }
-/*
+/**
- * We were notified that the host indicated by address &sin
+ * nlm_host_rebooted - Release all resources held by rebooted host
- * has rebooted.
+ * @info: pointer to decoded results of NLM_SM_NOTIFY call
- * Release all resources held by that peer.
+ *
+ * We were notified that the specified host has rebooted.  Release
+ * all resources held by that peer.
 */
-void nlm_host_rebooted(const struct sockaddr_in *sin,
+void nlm_host_rebooted(const struct nlm_reboot *info)
-                                const char *hostname,
-                                unsigned int hostname_len,
-                                u32 new_state)
 {
        struct hlist_head *chain;
        struct hlist_node *pos;
        struct nsm_handle *nsm;
        struct nlm_host *host;
-        nsm = nsm_find((struct sockaddr *)sin, sizeof(*sin),
+        nsm = nsm_reboot_lookup(info);
-                        hostname, hostname_len, 0);
+        if (unlikely(nsm == NULL))
-        if (nsm == NULL) {
-                dprintk("lockd: never saw rebooted peer '%.*s' before\n",
-                                hostname_len, hostname);
                return;
-        }
-        dprintk("lockd: nlm_host_rebooted(%.*s, %s)\n",
-                        hostname_len, hostname, nsm->sm_addrbuf);
-        /* When reclaiming locks on this peer, make sure that
-         * we set up a new notification */
-        nsm->sm_monitored = 0;
        /* Mark all hosts tied to this NSM state as having rebooted.
         * We run the loop repeatedly, because we drop the host table
@@ -520,8 +470,8 @@ again:	mutex_lock(&nlm_host_mutex);
        for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
                hlist_for_each_entry(host, pos, chain, h_hash) {
                        if (host->h_nsmhandle == nsm
-                         && host->h_nsmstate != new_state) {
+                         && host->h_nsmstate != info->state) {
-                                host->h_nsmstate = new_state;
+                                host->h_nsmstate = info->state;
                                host->h_state++;
                                nlm_get_host(host);
@@ -629,89 +579,3 @@ nlm_gc_hosts(void)
        next_gc = jiffies + NLM_HOST_COLLECT;
 }
-/*
- * Manage NSM handles
- */
-static LIST_HEAD(nsm_handles);
-static DEFINE_SPINLOCK(nsm_lock);
-static struct nsm_handle *nsm_find(const struct sockaddr *sap,
-                                   const size_t salen,
-                                   const char *hostname,
-                                   const size_t hostname_len,
-                                   const int create)
-{
-        struct nsm_handle *nsm = NULL;
-        struct nsm_handle *pos;
-        if (!sap)
-                return NULL;
-        if (hostname && memchr(hostname, '/', hostname_len) != NULL) {
-                if (printk_ratelimit()) {
-                        printk(KERN_WARNING "Invalid hostname \"%.*s\" "
-                                            "in NFS lock request\n",
-                                (int)hostname_len, hostname);
-                }
-                return NULL;
-        }
-retry:
-        spin_lock(&nsm_lock);
-        list_for_each_entry(pos, &nsm_handles, sm_link) {
-                if (hostname && nsm_use_hostnames) {
-                        if (strlen(pos->sm_name) != hostname_len
-                         || memcmp(pos->sm_name, hostname, hostname_len))
-                                continue;
-                } else if (!nlm_cmp_addr(nsm_addr(pos), sap))
-                        continue;
-                atomic_inc(&pos->sm_count);
-                kfree(nsm);
-                nsm = pos;
-                goto found;
-        }
-        if (nsm) {
-                list_add(&nsm->sm_link, &nsm_handles);
-                goto found;
-        }
-        spin_unlock(&nsm_lock);
-        if (!create)
-                return NULL;
-        nsm = kzalloc(sizeof(*nsm) + hostname_len + 1, GFP_KERNEL);
-        if (nsm == NULL)
-                return NULL;
-        memcpy(nsm_addr(nsm), sap, salen);
-        nsm->sm_addrlen = salen;
-        nsm->sm_name = (char *) (nsm + 1);
-        memcpy(nsm->sm_name, hostname, hostname_len);
-        nsm->sm_name[hostname_len] = '\0';
-        nlm_display_address((struct sockaddr *)&nsm->sm_addr,
-                                nsm->sm_addrbuf, sizeof(nsm->sm_addrbuf));
-        atomic_set(&nsm->sm_count, 1);
-        goto retry;
-found:
-        spin_unlock(&nsm_lock);
-        return nsm;
-}
-/*
- * Release an NSM handle
- */
-void
-nsm_release(struct nsm_handle *nsm)
-{
-        if (!nsm)
-                return;
-        if (atomic_dec_and_lock(&nsm->sm_count, &nsm_lock)) {
-                list_del(&nsm->sm_link);
-                spin_unlock(&nsm_lock);
-                kfree(nsm);
-        }
-}
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index ffd3461f75ef..5e2c4d5ac827 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -9,35 +9,123 @@
 #include <linux/types.h>
 #include <linux/utsname.h>
 #include <linux/kernel.h>
+#include <linux/ktime.h>
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/xprtsock.h>
 #include <linux/sunrpc/svc.h>
 #include <linux/lockd/lockd.h>
-#include <linux/lockd/sm_inter.h>
 #define NLMDBG_FACILITY         NLMDBG_MONITOR
+#define NSM_PROGRAM             100024
+#define NSM_VERSION             1
+enum {
+        NSMPROC_NULL,
+        NSMPROC_STAT,
+        NSMPROC_MON,
+        NSMPROC_UNMON,
+        NSMPROC_UNMON_ALL,
+        NSMPROC_SIMU_CRASH,
+        NSMPROC_NOTIFY,
+};
+struct nsm_args {
+        struct nsm_private      *priv;
+        u32                     prog;           /* RPC callback info */
+        u32                     vers;
+        u32                     proc;
-#define XDR_ADDRBUF_LEN         (20)
+        char                    *mon_name;
+};
-static struct rpc_clnt *        nsm_create(void);
+struct nsm_res {
+        u32                     status;
+        u32                     state;
+};
 static struct rpc_program       nsm_program;
+static                          LIST_HEAD(nsm_handles);
+static                          DEFINE_SPINLOCK(nsm_lock);
 /*
 * Local NSM state
 */
-int                             nsm_local_state;
+int     __read_mostly           nsm_local_state;
+int     __read_mostly           nsm_use_hostnames;
-/*
+static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm)
- * Common procedure for SM_MON/SM_UNMON calls
+{
- */
+        return (struct sockaddr *)&nsm->sm_addr;
-static int
+}
-nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
+static void nsm_display_ipv4_address(const struct sockaddr *sap, char *buf,
+                                     const size_t len)
+{
+        const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
+        snprintf(buf, len, "%pI4", &sin->sin_addr.s_addr);
+}
+static void nsm_display_ipv6_address(const struct sockaddr *sap, char *buf,
+                                     const size_t len)
+{
+        const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
+        if (ipv6_addr_v4mapped(&sin6->sin6_addr))
+                snprintf(buf, len, "%pI4", &sin6->sin6_addr.s6_addr32[3]);
+        else if (sin6->sin6_scope_id != 0)
+                snprintf(buf, len, "%pI6%%%u", &sin6->sin6_addr,
+                                sin6->sin6_scope_id);
+        else
+                snprintf(buf, len, "%pI6", &sin6->sin6_addr);
+}
+static void nsm_display_address(const struct sockaddr *sap,
+                                char *buf, const size_t len)
+{
+        switch (sap->sa_family) {
+        case AF_INET:
+                nsm_display_ipv4_address(sap, buf, len);
+                break;
+        case AF_INET6:
+                nsm_display_ipv6_address(sap, buf, len);
+                break;
+        default:
+                snprintf(buf, len, "unsupported address family");
+                break;
+        }
+}
+static struct rpc_clnt *nsm_create(void)
+{
+        struct sockaddr_in sin = {
+                .sin_family             = AF_INET,
+                .sin_addr.s_addr        = htonl(INADDR_LOOPBACK),
+        };
+        struct rpc_create_args args = {
+                .protocol               = XPRT_TRANSPORT_UDP,
+                .address                = (struct sockaddr *)&sin,
+                .addrsize               = sizeof(sin),
+                .servername             = "rpc.statd",
+                .program                = &nsm_program,
+                .version                = NSM_VERSION,
+                .authflavor             = RPC_AUTH_NULL,
+        };
+        return rpc_create(&args);
+}
+static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
 {
        struct rpc_clnt *clnt;
        int             status;
-        struct nsm_args args;
+        struct nsm_args args = {
+                .priv           = &nsm->sm_priv,
+                .prog           = NLM_PROGRAM,
+                .vers           = 3,
+                .proc           = NLMPROC_NSM_NOTIFY,
+                .mon_name       = nsm->sm_mon_name,
+        };
        struct rpc_message msg = {
                .rpc_argp       = &args,
                .rpc_resp       = res,
@@ -46,22 +134,18 @@ nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
        clnt = nsm_create();
        if (IS_ERR(clnt)) {
                status = PTR_ERR(clnt);
+                dprintk("lockd: failed to create NSM upcall transport, "
+                                "status=%d\n", status);
                goto out;
        }
-        memset(&args, 0, sizeof(args));
-        args.mon_name = nsm->sm_name;
-        args.addr = nsm_addr_in(nsm)->sin_addr.s_addr;
-        args.prog = NLM_PROGRAM;
-        args.vers = 3;
-        args.proc = NLMPROC_NSM_NOTIFY;
        memset(res, 0, sizeof(*res));
        msg.rpc_proc = &clnt->cl_procinfo[proc];
        status = rpc_call_sync(clnt, &msg, 0);
        if (status < 0)
-                printk(KERN_DEBUG "nsm_mon_unmon: rpc failed, status=%d\n",
+                dprintk("lockd: NSM upcall RPC failed, status=%d\n",
-                        status);
+                                status);
        else
                status = 0;
        rpc_shutdown_client(clnt);
@@ -69,82 +153,272 @@ nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
        return status;
 }
-/*
+/**
- * Set up monitoring of a remote host
+ * nsm_monitor - Notify a peer in case we reboot
+ * @host: pointer to nlm_host of peer to notify
+ *
+ * If this peer is not already monitored, this function sends an
+ * upcall to the local rpc.statd to record the name/address of
+ * the peer to notify in case we reboot.
+ *
+ * Returns zero if the peer is monitored by the local rpc.statd;
+ * otherwise a negative errno value is returned.
 */
-int
+int nsm_monitor(const struct nlm_host *host)
-nsm_monitor(struct nlm_host *host)
 {
        struct nsm_handle *nsm = host->h_nsmhandle;
        struct nsm_res  res;
        int             status;
-        dprintk("lockd: nsm_monitor(%s)\n", host->h_name);
+        dprintk("lockd: nsm_monitor(%s)\n", nsm->sm_name);
-        BUG_ON(nsm == NULL);
        if (nsm->sm_monitored)
                return 0;
-        status = nsm_mon_unmon(nsm, SM_MON, &res);
+        /*
+         * Choose whether to record the caller_name or IP address of
+         * this peer in the local rpc.statd's database.
+         */
+        nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf;
-        if (status < 0 || res.status != 0)
+        status = nsm_mon_unmon(nsm, NSMPROC_MON, &res);
-                printk(KERN_NOTICE "lockd: cannot monitor %s\n", host->h_name);
+        if (res.status != 0)
+                status = -EIO;
+        if (status < 0)
+                printk(KERN_NOTICE "lockd: cannot monitor %s\n", nsm->sm_name);
        else
                nsm->sm_monitored = 1;
        return status;
 }
-/*
+/**
- * Cease to monitor remote host
+ * nsm_unmonitor - Unregister peer notification
+ * @host: pointer to nlm_host of peer to stop monitoring
+ *
+ * If this peer is monitored, this function sends an upcall to
+ * tell the local rpc.statd not to send this peer a notification
+ * when we reboot.
 */
-int
+void nsm_unmonitor(const struct nlm_host *host)
-nsm_unmonitor(struct nlm_host *host)
 {
        struct nsm_handle *nsm = host->h_nsmhandle;
        struct nsm_res  res;
-        int             status = 0;
+        int status;
-        if (nsm == NULL)
-                return 0;
-        host->h_nsmhandle = NULL;
        if (atomic_read(&nsm->sm_count) == 1
         && nsm->sm_monitored && !nsm->sm_sticky) {
-                dprintk("lockd: nsm_unmonitor(%s)\n", host->h_name);
+                dprintk("lockd: nsm_unmonitor(%s)\n", nsm->sm_name);
-                status = nsm_mon_unmon(nsm, SM_UNMON, &res);
+                status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res);
+                if (res.status != 0)
+                        status = -EIO;
                if (status < 0)
                        printk(KERN_NOTICE "lockd: cannot unmonitor %s\n",
-                                        host->h_name);
+                                        nsm->sm_name);
                else
                        nsm->sm_monitored = 0;
        }
-        nsm_release(nsm);
+}
-        return status;
+static struct nsm_handle *nsm_lookup_hostname(const char *hostname,
+                                              const size_t len)
+{
+        struct nsm_handle *nsm;
+        list_for_each_entry(nsm, &nsm_handles, sm_link)
+                if (strlen(nsm->sm_name) == len &&
+                    memcmp(nsm->sm_name, hostname, len) == 0)
+                        return nsm;
+        return NULL;
+}
+static struct nsm_handle *nsm_lookup_addr(const struct sockaddr *sap)
+{
+        struct nsm_handle *nsm;
+        list_for_each_entry(nsm, &nsm_handles, sm_link)
+                if (nlm_cmp_addr(nsm_addr(nsm), sap))
+                        return nsm;
+        return NULL;
+}
+static struct nsm_handle *nsm_lookup_priv(const struct nsm_private *priv)
+{
+        struct nsm_handle *nsm;
+        list_for_each_entry(nsm, &nsm_handles, sm_link)
+                if (memcmp(nsm->sm_priv.data, priv->data,
+                                        sizeof(priv->data)) == 0)
+                        return nsm;
+        return NULL;
 }
 /*
- * Create NSM client for the local host
+ * Construct a unique cookie to match this nsm_handle to this monitored
+ * host.  It is passed to the local rpc.statd via NSMPROC_MON, and
+ * returned via NLMPROC_SM_NOTIFY, in the "priv" field of these
+ * requests.
+ *
+ * The NSM protocol requires that these cookies be unique while the
+ * system is running.  We prefer a stronger requirement of making them
+ * unique across reboots.  If user space bugs cause a stale cookie to
+ * be sent to the kernel, it could cause the wrong host to lose its
+ * lock state if cookies were not unique across reboots.
+ *
+ * The cookies are exposed only to local user space via loopback.  They
+ * do not appear on the physical network.  If we want greater security
+ * for some reason, nsm_init_private() could perform a one-way hash to
+ * obscure the contents of the cookie.
 */
-static struct rpc_clnt *
+static void nsm_init_private(struct nsm_handle *nsm)
-nsm_create(void)
 {
-        struct sockaddr_in      sin = {
+        u64 *p = (u64 *)&nsm->sm_priv.data;
-                .sin_family     = AF_INET,
+        struct timespec ts;
-                .sin_addr.s_addr = htonl(INADDR_LOOPBACK),
-                .sin_port       = 0,
-        };
-        struct rpc_create_args args = {
-                .protocol       = XPRT_TRANSPORT_UDP,
-                .address        = (struct sockaddr *)&sin,
-                .addrsize       = sizeof(sin),
-                .servername     = "localhost",
-                .program        = &nsm_program,
-                .version        = SM_VERSION,
-                .authflavor     = RPC_AUTH_NULL,
-        };
-        return rpc_create(&args);
+        ktime_get_ts(&ts);
+        *p++ = timespec_to_ns(&ts);
+        *p = (unsigned long)nsm;
+}
+static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap,
+                                            const size_t salen,
+                                            const char *hostname,
+                                            const size_t hostname_len)
+{
+        struct nsm_handle *new;
+        new = kzalloc(sizeof(*new) + hostname_len + 1, GFP_KERNEL);
+        if (unlikely(new == NULL))
+                return NULL;
+        atomic_set(&new->sm_count, 1);
+        new->sm_name = (char *)(new + 1);
+        memcpy(nsm_addr(new), sap, salen);
+        new->sm_addrlen = salen;
+        nsm_init_private(new);
+        nsm_display_address((const struct sockaddr *)&new->sm_addr,
+                                new->sm_addrbuf, sizeof(new->sm_addrbuf));
+        memcpy(new->sm_name, hostname, hostname_len);
+        new->sm_name[hostname_len] = '\0';
+        return new;
+}
+/**
+ * nsm_get_handle - Find or create a cached nsm_handle
+ * @sap: pointer to socket address of handle to find
+ * @salen: length of socket address
+ * @hostname: pointer to C string containing hostname to find
+ * @hostname_len: length of C string
+ *
+ * Behavior is modulated by the global nsm_use_hostnames variable.
+ *
+ * Returns a cached nsm_handle after bumping its ref count, or
+ * returns a fresh nsm_handle if a handle that matches @sap and/or
+ * @hostname cannot be found in the handle cache.  Returns NULL if
+ * an error occurs.
+ */
+struct nsm_handle *nsm_get_handle(const struct sockaddr *sap,
+                                  const size_t salen, const char *hostname,
+                                  const size_t hostname_len)
+{
+        struct nsm_handle *cached, *new = NULL;
+        if (hostname && memchr(hostname, '/', hostname_len) != NULL) {
+                if (printk_ratelimit()) {
+                        printk(KERN_WARNING "Invalid hostname \"%.*s\" "
+                                            "in NFS lock request\n",
+                                (int)hostname_len, hostname);
+                }
+                return NULL;
+        }
+retry:
+        spin_lock(&nsm_lock);
+        if (nsm_use_hostnames && hostname != NULL)
+                cached = nsm_lookup_hostname(hostname, hostname_len);
+        else
+                cached = nsm_lookup_addr(sap);
+        if (cached != NULL) {
+                atomic_inc(&cached->sm_count);
+                spin_unlock(&nsm_lock);
+                kfree(new);
+                dprintk("lockd: found nsm_handle for %s (%s), "
+                                "cnt %d\n", cached->sm_name,
+                                cached->sm_addrbuf,
+                                atomic_read(&cached->sm_count));
+                return cached;
+        }
+        if (new != NULL) {
+                list_add(&new->sm_link, &nsm_handles);
+                spin_unlock(&nsm_lock);
+                dprintk("lockd: created nsm_handle for %s (%s)\n",
+                                new->sm_name, new->sm_addrbuf);
+                return new;
+        }
+        spin_unlock(&nsm_lock);
+        new = nsm_create_handle(sap, salen, hostname, hostname_len);
+        if (unlikely(new == NULL))
+                return NULL;
+        goto retry;
+}
+/**
+ * nsm_reboot_lookup - match NLMPROC_SM_NOTIFY arguments to an nsm_handle
+ * @info: pointer to NLMPROC_SM_NOTIFY arguments
+ *
+ * Returns a matching nsm_handle if found in the nsm cache; the returned
+ * nsm_handle's reference count is bumped and sm_monitored is cleared.
+ * Otherwise returns NULL if some error occurred.
+ */
+struct nsm_handle *nsm_reboot_lookup(const struct nlm_reboot *info)
+{
+        struct nsm_handle *cached;
+        spin_lock(&nsm_lock);
+        cached = nsm_lookup_priv(&info->priv);
+        if (unlikely(cached == NULL)) {
+                spin_unlock(&nsm_lock);
+                dprintk("lockd: never saw rebooted peer '%.*s' before\n",
+                                info->len, info->mon);
+                return cached;
+        }
+        atomic_inc(&cached->sm_count);
+        spin_unlock(&nsm_lock);
+        /*
+         * During subsequent lock activity, force a fresh
+         * notification to be set up for this host.
+         */
+        cached->sm_monitored = 0;
+        dprintk("lockd: host %s (%s) rebooted, cnt %d\n",
+                        cached->sm_name, cached->sm_addrbuf,
+                        atomic_read(&cached->sm_count));
+        return cached;
+}
+/**
+ * nsm_release - Release an NSM handle
+ * @nsm: pointer to handle to be released
+ *
+ */
+void nsm_release(struct nsm_handle *nsm)
+{
+        if (atomic_dec_and_lock(&nsm->sm_count, &nsm_lock)) {
+                list_del(&nsm->sm_link);
+                spin_unlock(&nsm_lock);
+                dprintk("lockd: destroyed nsm_handle for %s (%s)\n",
+                                nsm->sm_name, nsm->sm_addrbuf);
+                kfree(nsm);
+        }
 }
 /*
@@ -154,127 +428,132 @@ nsm_create(void)
 * Status Monitor wire protocol.
 */
-static __be32 *xdr_encode_nsm_string(__be32 *p, char *string)
+static int encode_nsm_string(struct xdr_stream *xdr, const char *string)
 {
-        size_t len = strlen(string);
+        const u32 len = strlen(string);
+        __be32 *p;
-        if (len > SM_MAXSTRLEN)
-                len = SM_MAXSTRLEN;
+        if (unlikely(len > SM_MAXSTRLEN))
-        return xdr_encode_opaque(p, string, len);
+                return -EIO;
+        p = xdr_reserve_space(xdr, sizeof(u32) + len);
+        if (unlikely(p == NULL))
+                return -EIO;
+        xdr_encode_opaque(p, string, len);
+        return 0;
 }
 /*
 * "mon_name" specifies the host to be monitored.
- *
- * Linux uses a text version of the IP address of the remote
- * host as the host identifier (the "mon_name" argument).
- *
- * Linux statd always looks up the canonical hostname first for
- * whatever remote hostname it receives, so this works alright.
 */
-static __be32 *xdr_encode_mon_name(__be32 *p, struct nsm_args *argp)
+static int encode_mon_name(struct xdr_stream *xdr, const struct nsm_args *argp)
 {
-        char    buffer[XDR_ADDRBUF_LEN + 1];
+        return encode_nsm_string(xdr, argp->mon_name);
-        char    *name = argp->mon_name;
-        if (!nsm_use_hostnames) {
-                snprintf(buffer, XDR_ADDRBUF_LEN,
-                         "%pI4", &argp->addr);
-                name = buffer;
-        }
-        return xdr_encode_nsm_string(p, name);
 }
 /*
 * The "my_id" argument specifies the hostname and RPC procedure
 * to be called when the status manager receives notification
- * (via the SM_NOTIFY call) that the state of host "mon_name"
+ * (via the NLMPROC_SM_NOTIFY call) that the state of host "mon_name"
 * has changed.
 */
-static __be32 *xdr_encode_my_id(__be32 *p, struct nsm_args *argp)
+static int encode_my_id(struct xdr_stream *xdr, const struct nsm_args *argp)
 {
-        p = xdr_encode_nsm_string(p, utsname()->nodename);
+        int status;
-        if (!p)
+        __be32 *p;
-                return ERR_PTR(-EIO);
+        status = encode_nsm_string(xdr, utsname()->nodename);
+        if (unlikely(status != 0))
+                return status;
+        p = xdr_reserve_space(xdr, 3 * sizeof(u32));
+        if (unlikely(p == NULL))
+                return -EIO;
        *p++ = htonl(argp->prog);
        *p++ = htonl(argp->vers);
        *p++ = htonl(argp->proc);
+        return 0;
-        return p;
 }
 /*
 * The "mon_id" argument specifies the non-private arguments
- * of an SM_MON or SM_UNMON call.
+ * of an NSMPROC_MON or NSMPROC_UNMON call.
 */
-static __be32 *xdr_encode_mon_id(__be32 *p, struct nsm_args *argp)
+static int encode_mon_id(struct xdr_stream *xdr, const struct nsm_args *argp)
 {
-        p = xdr_encode_mon_name(p, argp);
+        int status;
-        if (!p)
-                return ERR_PTR(-EIO);
-        return xdr_encode_my_id(p, argp);
+        status = encode_mon_name(xdr, argp);
+        if (unlikely(status != 0))
+                return status;
+        return encode_my_id(xdr, argp);
 }
 /*
 * The "priv" argument may contain private information required
- * by the SM_MON call. This information will be supplied in the
+ * by the NSMPROC_MON call. This information will be supplied in the
- * SM_NOTIFY call.
+ * NLMPROC_SM_NOTIFY call.
- *
- * Linux provides the raw IP address of the monitored host,
- * left in network byte order.
 */
-static __be32 *xdr_encode_priv(__be32 *p, struct nsm_args *argp)
+static int encode_priv(struct xdr_stream *xdr, const struct nsm_args *argp)
 {
-        *p++ = argp->addr;
+        __be32 *p;
-        *p++ = 0;
-        *p++ = 0;
-        *p++ = 0;
-        return p;
+        p = xdr_reserve_space(xdr, SM_PRIV_SIZE);
+        if (unlikely(p == NULL))
+                return -EIO;
+        xdr_encode_opaque_fixed(p, argp->priv->data, SM_PRIV_SIZE);
+        return 0;
 }
-static int
+static int xdr_enc_mon(struct rpc_rqst *req, __be32 *p,
-xdr_encode_mon(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp)
+                       const struct nsm_args *argp)
 {
-        p = xdr_encode_mon_id(p, argp);
+        struct xdr_stream xdr;
-        if (IS_ERR(p))
+        int status;
-                return PTR_ERR(p);
+        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-        p = xdr_encode_priv(p, argp);
+        status = encode_mon_id(&xdr, argp);
-        if (IS_ERR(p))
+        if (unlikely(status))
-                return PTR_ERR(p);
+                return status;
+        return encode_priv(&xdr, argp);
-        rqstp->rq_slen = xdr_adjust_iovec(rqstp->rq_svec, p);
-        return 0;
 }
-static int
+static int xdr_enc_unmon(struct rpc_rqst *req, __be32 *p,
-xdr_encode_unmon(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp)
+                         const struct nsm_args *argp)
 {
-        p = xdr_encode_mon_id(p, argp);
+        struct xdr_stream xdr;
-        if (IS_ERR(p))
-                return PTR_ERR(p);
+        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
-        rqstp->rq_slen = xdr_adjust_iovec(rqstp->rq_svec, p);
+        return encode_mon_id(&xdr, argp);
-        return 0;
 }
-static int
+static int xdr_dec_stat_res(struct rpc_rqst *rqstp, __be32 *p,
-xdr_decode_stat_res(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp)
+                            struct nsm_res *resp)
 {
+        struct xdr_stream xdr;
+        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        p = xdr_inline_decode(&xdr, 2 * sizeof(u32));
+        if (unlikely(p == NULL))
+                return -EIO;
        resp->status = ntohl(*p++);
-        resp->state = ntohl(*p++);
+        resp->state = ntohl(*p);
-        dprintk("nsm: xdr_decode_stat_res status %d state %d\n",
+        dprintk("lockd: xdr_dec_stat_res status %d state %d\n",
                        resp->status, resp->state);
        return 0;
 }
-static int
+static int xdr_dec_stat(struct rpc_rqst *rqstp, __be32 *p,
-xdr_decode_stat(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp)
+                        struct nsm_res *resp)
 {
-        resp->state = ntohl(*p++);
+        struct xdr_stream xdr;
+        xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+        p = xdr_inline_decode(&xdr, sizeof(u32));
+        if (unlikely(p == NULL))
+                return -EIO;
+        resp->state = ntohl(*p);
+        dprintk("lockd: xdr_dec_stat state %d\n", resp->state);
        return 0;
 }
@@ -288,22 +567,22 @@ xdr_decode_stat(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp)
 #define SM_unmonres_sz  1
 static struct rpc_procinfo      nsm_procedures[] = {
-[SM_MON] = {
+[NSMPROC_MON] = {
-                .p_proc         = SM_MON,
+                .p_proc         = NSMPROC_MON,
-                .p_encode       = (kxdrproc_t) xdr_encode_mon,
+                .p_encode       = (kxdrproc_t)xdr_enc_mon,
-                .p_decode       = (kxdrproc_t) xdr_decode_stat_res,
+                .p_decode       = (kxdrproc_t)xdr_dec_stat_res,
                .p_arglen       = SM_mon_sz,
                .p_replen       = SM_monres_sz,
-                .p_statidx      = SM_MON,
+                .p_statidx      = NSMPROC_MON,
                .p_name         = "MONITOR",
        },
-[SM_UNMON] = {
+[NSMPROC_UNMON] = {
-                .p_proc         = SM_UNMON,
+                .p_proc         = NSMPROC_UNMON,
-                .p_encode       = (kxdrproc_t) xdr_encode_unmon,
+                .p_encode       = (kxdrproc_t)xdr_enc_unmon,
-                .p_decode       = (kxdrproc_t) xdr_decode_stat,
+                .p_decode       = (kxdrproc_t)xdr_dec_stat,
                .p_arglen       = SM_mon_id_sz,
                .p_replen       = SM_unmonres_sz,
-                .p_statidx      = SM_UNMON,
+                .p_statidx      = NSMPROC_UNMON,
                .p_name         = "UNMONITOR",
        },
 };
@@ -322,7 +601,7 @@ static struct rpc_stat		nsm_stats;
 static struct rpc_program       nsm_program = {
                .name           = "statd",
-                .number         = SM_PROGRAM,
+                .number         = NSM_PROGRAM,
                .nrvers         = ARRAY_SIZE(nsm_version),
                .version        = nsm_version,
                .stats          = &nsm_stats
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 252d80163d02..64f1c31b5853 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -35,7 +35,6 @@
 #include <linux/sunrpc/svcsock.h>
 #include <net/ip.h>
 #include <linux/lockd/lockd.h>
-#include <linux/lockd/sm_inter.h>
 #include <linux/nfs.h>
 #define NLMDBG_FACILITY         NLMDBG_SVC
@@ -54,13 +53,26 @@ static struct svc_rqst		*nlmsvc_rqst;
 unsigned long                   nlmsvc_timeout;
 /*
+ * If the kernel has IPv6 support available, always listen for
+ * both AF_INET and AF_INET6 requests.
+ */
+#if (defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)) && \
+        defined(CONFIG_SUNRPC_REGISTER_V4)
+static const sa_family_t        nlmsvc_family = AF_INET6;
+#else   /* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */
+static const sa_family_t        nlmsvc_family = AF_INET;
+#endif  /* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */
+/*
 * These can be set at insmod time (useful for NFS as root filesystem),
 * and also changed through the sysctl interface.  -- Jamie Lokier, Aug 2003
 */
 static unsigned long            nlm_grace_period;
 static unsigned long            nlm_timeout = LOCKD_DFLT_TIMEO;
 static int                      nlm_udpport, nlm_tcpport;
-int                             nsm_use_hostnames = 0;
+/* RLIM_NOFILE defaults to 1024. That seems like a reasonable default here. */
+static unsigned int             nlm_max_connections = 1024;
 /*
 * Constants needed for the sysctl interface.
@@ -143,6 +155,9 @@ lockd(void *vrqstp)
                long timeout = MAX_SCHEDULE_TIMEOUT;
                RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
+                /* update sv_maxconn if it has changed */
+                rqstp->rq_server->sv_maxconn = nlm_max_connections;
                if (signalled()) {
                        flush_signals(current);
                        if (nlmsvc_ops) {
@@ -189,6 +204,19 @@ lockd(void *vrqstp)
        return 0;
 }
+static int create_lockd_listener(struct svc_serv *serv, char *name,
+                                 unsigned short port)
+{
+        struct svc_xprt *xprt;
+        xprt = svc_find_xprt(serv, name, 0, 0);
+        if (xprt == NULL)
+                return svc_create_xprt(serv, name, port, SVC_SOCK_DEFAULTS);
+        svc_xprt_put(xprt);
+        return 0;
+}
 /*
 * Ensure there are active UDP and TCP listeners for lockd.
 *
@@ -202,29 +230,23 @@ lockd(void *vrqstp)
 static int make_socks(struct svc_serv *serv)
 {
        static int warned;
-        struct svc_xprt *xprt;
+        int err;
-        int err = 0;
-        xprt = svc_find_xprt(serv, "udp", 0, 0);
+        err = create_lockd_listener(serv, "udp", nlm_udpport);
-        if (!xprt)
+        if (err < 0)
-                err = svc_create_xprt(serv, "udp", nlm_udpport,
+                goto out_err;
-                                      SVC_SOCK_DEFAULTS);
-        else
+        err = create_lockd_listener(serv, "tcp", nlm_tcpport);
-                svc_xprt_put(xprt);
+        if (err < 0)
-        if (err >= 0) {
+                goto out_err;
-                xprt = svc_find_xprt(serv, "tcp", 0, 0);
-                if (!xprt)
+        warned = 0;
-                        err = svc_create_xprt(serv, "tcp", nlm_tcpport,
+        return 0;
-                                              SVC_SOCK_DEFAULTS);
-                else
+out_err:
-                        svc_xprt_put(xprt);
+        if (warned++ == 0)
-        }
-        if (err >= 0) {
-                warned = 0;
-                err = 0;
-        } else if (warned++ == 0)
                printk(KERN_WARNING
-                       "lockd_up: makesock failed, error=%d\n", err);
+                        "lockd_up: makesock failed, error=%d\n", err);
        return err;
 }
@@ -252,7 +274,7 @@ int lockd_up(void)
                        "lockd_up: no pid, %d users??\n", nlmsvc_users);
        error = -ENOMEM;
-        serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, AF_INET, NULL);
+        serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, nlmsvc_family, NULL);
        if (!serv) {
                printk(KERN_WARNING "lockd_up: create service failed\n");
                goto out;
@@ -276,6 +298,7 @@ int lockd_up(void)
        }
        svc_sock_update_bufs(serv);
+        serv->sv_maxconn = nlm_max_connections;
        nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, serv->sv_name);
        if (IS_ERR(nlmsvc_task)) {
@@ -485,6 +508,7 @@ module_param_call(nlm_udpport, param_set_port, param_get_int,
 module_param_call(nlm_tcpport, param_set_port, param_get_int,
                  &nlm_tcpport, 0644);
 module_param(nsm_use_hostnames, bool, 0644);
+module_param(nlm_max_connections, uint, 0644);
 /*
 * Initialising and terminating the module.
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 4dfdcbc6bf68..1725037374c5 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -16,8 +16,6 @@
 #include <linux/nfsd/nfsd.h>
 #include <linux/lockd/lockd.h>
 #include <linux/lockd/share.h>
-#include <linux/lockd/sm_inter.h>
 #define NLMDBG_FACILITY         NLMDBG_CLIENT
@@ -419,8 +417,6 @@ static __be32
 nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
                                              void              *resp)
 {
-        struct sockaddr_in      saddr;
        dprintk("lockd: SM_NOTIFY     called\n");
        if (!nlm_privileged_requester(rqstp)) {
@@ -430,14 +426,7 @@ nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
                return rpc_system_err;
        }
-        /* Obtain the host pointer for this NFS server and try to
+        nlm_host_rebooted(argp);
-         * reclaim all locks we hold on this server.
-         */
-        memset(&saddr, 0, sizeof(saddr));
-        saddr.sin_family = AF_INET;
-        saddr.sin_addr.s_addr = argp->addr;
-        nlm_host_rebooted(&saddr, argp->mon, argp->len, argp->state);
        return rpc_success;
 }
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 3ca89e2a9381..3688e55901fc 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -16,8 +16,6 @@
 #include <linux/nfsd/nfsd.h>
 #include <linux/lockd/lockd.h>
 #include <linux/lockd/share.h>
-#include <linux/lockd/sm_inter.h>
 #define NLMDBG_FACILITY         NLMDBG_CLIENT
@@ -451,8 +449,6 @@ static __be32
 nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
                                              void              *resp)
 {
-        struct sockaddr_in      saddr;
        dprintk("lockd: SM_NOTIFY     called\n");
        if (!nlm_privileged_requester(rqstp)) {
@@ -462,14 +458,7 @@ nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
                return rpc_system_err;
        }
-        /* Obtain the host pointer for this NFS server and try to
+        nlm_host_rebooted(argp);
-         * reclaim all locks we hold on this server.
-         */
-        memset(&saddr, 0, sizeof(saddr));
-        saddr.sin_family = AF_INET;
-        saddr.sin_addr.s_addr = argp->addr;
-        nlm_host_rebooted(&saddr, argp->mon, argp->len, argp->state);
        return rpc_success;
 }
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index 34c2766e27c7..9e4d6aab611b 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -17,7 +17,6 @@
 #include <linux/nfsd/export.h>
 #include <linux/lockd/lockd.h>
 #include <linux/lockd/share.h>
-#include <linux/lockd/sm_inter.h>
 #include <linux/module.h>
 #include <linux/mount.h>
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 1f226290c67c..0336f2beacde 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -16,7 +16,6 @@
 #include <linux/sunrpc/svc.h>
 #include <linux/sunrpc/stats.h>
 #include <linux/lockd/lockd.h>
-#include <linux/lockd/sm_inter.h>
 #define NLMDBG_FACILITY         NLMDBG_XDR
@@ -349,8 +348,8 @@ nlmsvc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp)
        if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN)))
                return 0;
        argp->state = ntohl(*p++);
-        /* Preserve the address in network byte order */
+        memcpy(&argp->priv.data, p, sizeof(argp->priv.data));
-        argp->addr = *p++;
+        p += XDR_QUADLEN(SM_PRIV_SIZE);
        return xdr_argsize_check(rqstp, p);
 }
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index 50c493a8ad8e..e1d528653192 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -17,7 +17,6 @@
 #include <linux/sunrpc/svc.h>
 #include <linux/sunrpc/stats.h>
 #include <linux/lockd/lockd.h>
-#include <linux/lockd/sm_inter.h>
 #define NLMDBG_FACILITY         NLMDBG_XDR
@@ -356,8 +355,8 @@ nlm4svc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp
        if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN)))
                return 0;
        argp->state = ntohl(*p++);
-        /* Preserve the address in network byte order */
+        memcpy(&argp->priv.data, p, sizeof(argp->priv.data));
-        argp->addr  = *p++;
+        p += XDR_QUADLEN(SM_PRIV_SIZE);
        return xdr_argsize_check(rqstp, p);
 }
diff --git a/fs/locks.c b/fs/locks.c
index 46a2e12f7d42..ec3deea29e37 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1564,7 +1564,7 @@ EXPORT_SYMBOL(flock_lock_file_wait);
 *      %LOCK_MAND can be combined with %LOCK_READ or %LOCK_WRITE to allow other
 *      processes read and write access respectively.
 */
-asmlinkage long sys_flock(unsigned int fd, unsigned int cmd)
+SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
 {
        struct file *filp;
        struct file_lock *lock;
diff --git a/fs/minix/Kconfig b/fs/minix/Kconfig
new file mode 100644
index 000000000000..0fd7ca994264
--- /dev/null
+++ b/fs/minix/Kconfig
@@ -0,0 +1,17 @@
+config MINIX_FS
+        tristate "Minix file system support"
+        depends on BLOCK
+        help
+          Minix is a simple operating system used in many classes about OS's.
+          The minix file system (method to organize files on a hard disk
+          partition or a floppy disk) was the original file system for Linux,
+          but has been superseded by the second extended file system ext2fs.
+          You don't want to use the minix file system on your hard disk
+          because of certain built-in restrictions, but it is sometimes found
+          on older Linux floppy disks.  This option will enlarge your kernel
+          by about 28 KB. If unsure, say N.
+          To compile this file system support as a module, choose M here: the
+          module will be called minix.  Note that the file system of your root
+          partition (the one containing the directory /) cannot be compiled as
+          a module.
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index f70433816a38..d4946c4c90e2 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -280,7 +280,7 @@ int minix_add_link(struct dentry *dentry, struct inode *inode)
        return -EINVAL;
 got_it:
-        pos = (page->index >> PAGE_CACHE_SHIFT) + p - (char*)page_address(page);
+        pos = page_offset(page) + p - (char *)page_address(page);
        err = __minix_write_begin(NULL, page->mapping, pos, sbi->s_dirsize,
                                        AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
        if (err)
diff --git a/fs/mpage.c b/fs/mpage.c
index 552b80b3facc..16c3ef37eae3 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -241,7 +241,6 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
                                first_hole = page_block;
                        page_block++;
                        block_in_file++;
-                        clear_buffer_mapped(map_bh);
                        continue;
                }
@@ -308,7 +307,10 @@ alloc_new:
                goto alloc_new;
        }
-        if (buffer_boundary(map_bh) || (first_hole != blocks_per_page))
+        relative_block = block_in_file - *first_logical_block;
+        nblocks = map_bh->b_size >> blkbits;
+        if ((buffer_boundary(map_bh) && relative_block == nblocks) ||
+            (first_hole != blocks_per_page))
                bio = mpage_bio_submit(READ, bio);
        else
                *last_block_in_bio = blocks[blocks_per_page - 1];
diff --git a/fs/namei.c b/fs/namei.c
index 734f2b5591bf..199317642ad6 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -227,6 +227,16 @@ int generic_permission(struct inode *inode, int mask,
        return -EACCES;
 }
+/**
+ * inode_permission  -  check for access rights to a given inode
+ * @inode:      inode to check permission on
+ * @mask:       right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
+ *
+ * Used to check for read/write/execute permissions on an inode.
+ * We use "fsuid" for this, letting us set arbitrary permissions
+ * for filesystem access without changing the "normal" uids which
+ * are used for other things.
+ */
 int inode_permission(struct inode *inode, int mask)
 {
        int retval;
@@ -248,8 +258,7 @@ int inode_permission(struct inode *inode, int mask)
                        return -EACCES;
        }
-        /* Ordinary permission routines do not understand MAY_APPEND. */
+        if (inode->i_op->permission)
-        if (inode->i_op && inode->i_op->permission)
                retval = inode->i_op->permission(inode, mask);
        else
                retval = generic_permission(inode, mask, NULL);
@@ -266,21 +275,6 @@ int inode_permission(struct inode *inode, int mask)
 }
 /**
- * vfs_permission  -  check for access rights to a given path
- * @nd:         lookup result that describes the path
- * @mask:       right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
- *
- * Used to check for read/write/execute permissions on a path.
- * We use "fsuid" for this, letting us set arbitrary permissions
- * for filesystem access without changing the "normal" uids which
- * are used for other things.
- */
-int vfs_permission(struct nameidata *nd, int mask)
-{
-        return inode_permission(nd->path.dentry->d_inode, mask);
-}
-/**
 * file_permission  -  check for additional access rights to a given file
 * @file:       file to check access rights for
 * @mask:       right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
@@ -290,7 +284,7 @@ int vfs_permission(struct nameidata *nd, int mask)
 *
 * Note:
 *      Do not use this function in new code.  All access checks should
- *      be done using vfs_permission().
+ *      be done using inode_permission().
 */
 int file_permission(struct file *file, int mask)
 {
@@ -439,7 +433,7 @@ static int exec_permission_lite(struct inode *inode)
 {
        umode_t mode = inode->i_mode;
-        if (inode->i_op && inode->i_op->permission)
+        if (inode->i_op->permission)
                return -EAGAIN;
        if (current_fsuid() == inode->i_uid)
@@ -528,18 +522,6 @@ out_unlock:
        return result;
 }
-/* SMP-safe */
-static __always_inline void
-walk_init_root(const char *name, struct nameidata *nd)
-{
-        struct fs_struct *fs = current->fs;
-        read_lock(&fs->lock);
-        nd->path = fs->root;
-        path_get(&fs->root);
-        read_unlock(&fs->lock);
-}
 /*
 * Wrapper to retry pathname resolution whenever the underlying
 * file system returns an ESTALE.
@@ -577,9 +559,16 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l
                goto fail;
        if (*link == '/') {
+                struct fs_struct *fs = current->fs;
                path_put(&nd->path);
-                walk_init_root(link, nd);
+                read_lock(&fs->lock);
+                nd->path = fs->root;
+                path_get(&fs->root);
+                read_unlock(&fs->lock);
        }
        res = link_path_walk(link, nd);
        if (nd->depth || res || nd->last_type!=LAST_NORM)
                return res;
@@ -860,7 +849,8 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
                nd->flags |= LOOKUP_CONTINUE;
                err = exec_permission_lite(inode);
                if (err == -EAGAIN)
-                        err = vfs_permission(nd, MAY_EXEC);
+                        err = inode_permission(nd->path.dentry->d_inode,
+                                               MAY_EXEC);
                if (!err)
                        err = ima_path_check(&nd->path, MAY_EXEC);
                if (err)
@@ -921,9 +911,6 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
                inode = next.dentry->d_inode;
                if (!inode)
                        goto out_dput;
-                err = -ENOTDIR; 
-                if (!inode->i_op)
-                        goto out_dput;
                if (inode->i_op->follow_link) {
                        err = do_follow_link(&next, nd);
@@ -933,9 +920,6 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
                        inode = nd->path.dentry->d_inode;
                        if (!inode)
                                break;
-                        err = -ENOTDIR; 
-                        if (!inode->i_op)
-                                break;
                } else
                        path_to_nameidata(&next, nd);
                err = -ENOTDIR; 
@@ -974,7 +958,7 @@ last_component:
                        break;
                inode = next.dentry->d_inode;
                if ((lookup_flags & LOOKUP_FOLLOW)
-                    && inode && inode->i_op && inode->i_op->follow_link) {
+                    && inode && inode->i_op->follow_link) {
                        err = do_follow_link(&next, nd);
                        if (err)
                                goto return_err;
@@ -986,7 +970,7 @@ last_component:
                        break;
                if (lookup_flags & LOOKUP_DIRECTORY) {
                        err = -ENOTDIR; 
-                        if (!inode->i_op || !inode->i_op->lookup)
+                        if (!inode->i_op->lookup)
                                break;
                }
                goto return_base;
@@ -1482,7 +1466,7 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
        if (error)
                return error;
-        if (!dir->i_op || !dir->i_op->create)
+        if (!dir->i_op->create)
                return -EACCES; /* shouldn't it be ENOSYS? */
        mode &= S_IALLUGO;
        mode |= S_IFREG;
@@ -1496,9 +1480,9 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
        return error;
 }
-int may_open(struct nameidata *nd, int acc_mode, int flag)
+int may_open(struct path *path, int acc_mode, int flag)
 {
-        struct dentry *dentry = nd->path.dentry;
+        struct dentry *dentry = path->dentry;
        struct inode *inode = dentry->d_inode;
        int error;
@@ -1519,17 +1503,17 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
        if (S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
                flag &= ~O_TRUNC;
        } else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) {
-                if (nd->path.mnt->mnt_flags & MNT_NODEV)
+                if (path->mnt->mnt_flags & MNT_NODEV)
                        return -EACCES;
                flag &= ~O_TRUNC;
        }
-        error = vfs_permission(nd, acc_mode);
+        error = inode_permission(inode, acc_mode);
        if (error)
                return error;
-        error = ima_path_check(&nd->path,
+        error = ima_path_check(path,
                               acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC));
        if (error)
                return error;
@@ -1564,6 +1548,9 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
                 * Refuse to truncate files with mandatory locks held on them.
                 */
                error = locks_verify_locked(inode);
+                if (!error)
+                        error = security_path_truncate(path, 0,
+                                               ATTR_MTIME|ATTR_CTIME|ATTR_OPEN);
                if (!error) {
                        DQUOT_INIT(inode);
@@ -1594,14 +1581,18 @@ static int __open_namei_create(struct nameidata *nd, struct path *path,
        if (!IS_POSIXACL(dir->d_inode))
                mode &= ~current->fs->umask;
+        error = security_path_mknod(&nd->path, path->dentry, mode, 0);
+        if (error)
+                goto out_unlock;
        error = vfs_create(dir->d_inode, path->dentry, mode, nd);
+out_unlock:
        mutex_unlock(&dir->d_inode->i_mutex);
        dput(nd->path.dentry);
        nd->path.dentry = path->dentry;
        if (error)
                return error;
        /* Don't check for write permission, don't truncate */
-        return may_open(nd, 0, flag & ~O_TRUNC);
+        return may_open(&nd->path, 0, flag & ~O_TRUNC);
 }
 /*
@@ -1763,7 +1754,7 @@ do_last:
        error = -ENOENT;
        if (!path.dentry->d_inode)
                goto exit_dput;
-        if (path.dentry->d_inode->i_op && path.dentry->d_inode->i_op->follow_link)
+        if (path.dentry->d_inode->i_op->follow_link)
                goto do_link;
        path_to_nameidata(&path, &nd);
@@ -1787,7 +1778,7 @@ ok:
                if (error)
                        goto exit;
        }
-        error = may_open(&nd, acc_mode, flag);
+        error = may_open(&nd.path, acc_mode, flag);
        if (error) {
                if (will_write)
                        mnt_drop_write(nd.path.mnt);
@@ -1944,7 +1935,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
        if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD))
                return -EPERM;
-        if (!dir->i_op || !dir->i_op->mknod)
+        if (!dir->i_op->mknod)
                return -EPERM;
        error = devcgroup_inode_mknod(mode, dev);
@@ -1979,8 +1970,8 @@ static int may_mknod(mode_t mode)
        }
 }
-asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode,
+SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, int, mode,
-                                unsigned dev)
+                unsigned, dev)
 {
        int error;
        char *tmp;
@@ -2007,6 +1998,9 @@ asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode,
        error = mnt_want_write(nd.path.mnt);
        if (error)
                goto out_dput;
+        error = security_path_mknod(&nd.path, dentry, mode, dev);
+        if (error)
+                goto out_drop_write;
        switch (mode & S_IFMT) {
                case 0: case S_IFREG:
                        error = vfs_create(nd.path.dentry->d_inode,dentry,mode,&nd);
@@ -2019,6 +2013,7 @@ asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode,
                        error = vfs_mknod(nd.path.dentry->d_inode,dentry,mode,0);
                        break;
        }
+out_drop_write:
        mnt_drop_write(nd.path.mnt);
 out_dput:
        dput(dentry);
@@ -2030,7 +2025,7 @@ out_unlock:
        return error;
 }
-asmlinkage long sys_mknod(const char __user *filename, int mode, unsigned dev)
+SYSCALL_DEFINE3(mknod, const char __user *, filename, int, mode, unsigned, dev)
 {
        return sys_mknodat(AT_FDCWD, filename, mode, dev);
 }
@@ -2042,7 +2037,7 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        if (error)
                return error;
-        if (!dir->i_op || !dir->i_op->mkdir)
+        if (!dir->i_op->mkdir)
                return -EPERM;
        mode &= (S_IRWXUGO|S_ISVTX);
@@ -2057,7 +2052,7 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        return error;
 }
-asmlinkage long sys_mkdirat(int dfd, const char __user *pathname, int mode)
+SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, int, mode)
 {
        int error = 0;
        char * tmp;
@@ -2078,7 +2073,11 @@ asmlinkage long sys_mkdirat(int dfd, const char __user *pathname, int mode)
        error = mnt_want_write(nd.path.mnt);
        if (error)
                goto out_dput;
+        error = security_path_mkdir(&nd.path, dentry, mode);
+        if (error)
+                goto out_drop_write;
        error = vfs_mkdir(nd.path.dentry->d_inode, dentry, mode);
+out_drop_write:
        mnt_drop_write(nd.path.mnt);
 out_dput:
        dput(dentry);
@@ -2090,7 +2089,7 @@ out_err:
        return error;
 }
-asmlinkage long sys_mkdir(const char __user *pathname, int mode)
+SYSCALL_DEFINE2(mkdir, const char __user *, pathname, int, mode)
 {
        return sys_mkdirat(AT_FDCWD, pathname, mode);
 }
@@ -2129,7 +2128,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
        if (error)
                return error;
-        if (!dir->i_op || !dir->i_op->rmdir)
+        if (!dir->i_op->rmdir)
                return -EPERM;
        DQUOT_INIT(dir);
@@ -2188,7 +2187,11 @@ static long do_rmdir(int dfd, const char __user *pathname)
        error = mnt_want_write(nd.path.mnt);
        if (error)
                goto exit3;
+        error = security_path_rmdir(&nd.path, dentry);
+        if (error)
+                goto exit4;
        error = vfs_rmdir(nd.path.dentry->d_inode, dentry);
+exit4:
        mnt_drop_write(nd.path.mnt);
 exit3:
        dput(dentry);
@@ -2200,7 +2203,7 @@ exit1:
        return error;
 }
-asmlinkage long sys_rmdir(const char __user *pathname)
+SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
 {
        return do_rmdir(AT_FDCWD, pathname);
 }
@@ -2212,7 +2215,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
        if (error)
                return error;
-        if (!dir->i_op || !dir->i_op->unlink)
+        if (!dir->i_op->unlink)
                return -EPERM;
        DQUOT_INIT(dir);
@@ -2273,7 +2276,11 @@ static long do_unlinkat(int dfd, const char __user *pathname)
                error = mnt_want_write(nd.path.mnt);
                if (error)
                        goto exit2;
+                error = security_path_unlink(&nd.path, dentry);
+                if (error)
+                        goto exit3;
                error = vfs_unlink(nd.path.dentry->d_inode, dentry);
+exit3:
                mnt_drop_write(nd.path.mnt);
        exit2:
                dput(dentry);
@@ -2292,7 +2299,7 @@ slashes:
        goto exit2;
 }
-asmlinkage long sys_unlinkat(int dfd, const char __user *pathname, int flag)
+SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
 {
        if ((flag & ~AT_REMOVEDIR) != 0)
                return -EINVAL;
@@ -2303,7 +2310,7 @@ asmlinkage long sys_unlinkat(int dfd, const char __user *pathname, int flag)
        return do_unlinkat(dfd, pathname);
 }
-asmlinkage long sys_unlink(const char __user *pathname)
+SYSCALL_DEFINE1(unlink, const char __user *, pathname)
 {
        return do_unlinkat(AT_FDCWD, pathname);
 }
@@ -2315,7 +2322,7 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
        if (error)
                return error;
-        if (!dir->i_op || !dir->i_op->symlink)
+        if (!dir->i_op->symlink)
                return -EPERM;
        error = security_inode_symlink(dir, dentry, oldname);
@@ -2329,8 +2336,8 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
        return error;
 }
-asmlinkage long sys_symlinkat(const char __user *oldname,
+SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
-                              int newdfd, const char __user *newname)
+                int, newdfd, const char __user *, newname)
 {
        int error;
        char *from;
@@ -2354,7 +2361,11 @@ asmlinkage long sys_symlinkat(const char __user *oldname,
        error = mnt_want_write(nd.path.mnt);
        if (error)
                goto out_dput;
+        error = security_path_symlink(&nd.path, dentry, from);
+        if (error)
+                goto out_drop_write;
        error = vfs_symlink(nd.path.dentry->d_inode, dentry, from);
+out_drop_write:
        mnt_drop_write(nd.path.mnt);
 out_dput:
        dput(dentry);
@@ -2367,7 +2378,7 @@ out_putname:
        return error;
 }
-asmlinkage long sys_symlink(const char __user *oldname, const char __user *newname)
+SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname)
 {
        return sys_symlinkat(oldname, AT_FDCWD, newname);
 }
@@ -2392,7 +2403,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
         */
        if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
                return -EPERM;
-        if (!dir->i_op || !dir->i_op->link)
+        if (!dir->i_op->link)
                return -EPERM;
        if (S_ISDIR(inode->i_mode))
                return -EPERM;
@@ -2419,9 +2430,8 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
 * with linux 2.0, and to avoid hard-linking to directories
 * and other special files.  --ADM
 */
-asmlinkage long sys_linkat(int olddfd, const char __user *oldname,
+SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
-                           int newdfd, const char __user *newname,
+                int, newdfd, const char __user *, newname, int, flags)
-                           int flags)
 {
        struct dentry *new_dentry;
        struct nameidata nd;
@@ -2451,7 +2461,11 @@ asmlinkage long sys_linkat(int olddfd, const char __user *oldname,
        error = mnt_want_write(nd.path.mnt);
        if (error)
                goto out_dput;
+        error = security_path_link(old_path.dentry, &nd.path, new_dentry);
+        if (error)
+                goto out_drop_write;
        error = vfs_link(old_path.dentry, nd.path.dentry->d_inode, new_dentry);
+out_drop_write:
        mnt_drop_write(nd.path.mnt);
 out_dput:
        dput(new_dentry);
@@ -2466,7 +2480,7 @@ out:
        return error;
 }
-asmlinkage long sys_link(const char __user *oldname, const char __user *newname)
+SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname)
 {
        return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
 }
@@ -2595,7 +2609,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        if (error)
                return error;
-        if (!old_dir->i_op || !old_dir->i_op->rename)
+        if (!old_dir->i_op->rename)
                return -EPERM;
        DQUOT_INIT(old_dir);
@@ -2617,8 +2631,8 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        return error;
 }
-asmlinkage long sys_renameat(int olddfd, const char __user *oldname,
+SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
-                             int newdfd, const char __user *newname)
+                int, newdfd, const char __user *, newname)
 {
        struct dentry *old_dir, *new_dir;
        struct dentry *old_dentry, *new_dentry;
@@ -2687,8 +2701,13 @@ asmlinkage long sys_renameat(int olddfd, const char __user *oldname,
        error = mnt_want_write(oldnd.path.mnt);
        if (error)
                goto exit5;
+        error = security_path_rename(&oldnd.path, old_dentry,
+                                     &newnd.path, new_dentry);
+        if (error)
+                goto exit6;
        error = vfs_rename(old_dir->d_inode, old_dentry,
                                   new_dir->d_inode, new_dentry);
+exit6:
        mnt_drop_write(oldnd.path.mnt);
 exit5:
        dput(new_dentry);
@@ -2706,7 +2725,7 @@ exit:
        return error;
 }
-asmlinkage long sys_rename(const char __user *oldname, const char __user *newname)
+SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
 {
        return sys_renameat(AT_FDCWD, oldname, AT_FDCWD, newname);
 }
@@ -2758,13 +2777,16 @@ int vfs_follow_link(struct nameidata *nd, const char *link)
 /* get the link contents into pagecache */
 static char *page_getlink(struct dentry * dentry, struct page **ppage)
 {
-        struct page * page;
+        char *kaddr;
+        struct page *page;
        struct address_space *mapping = dentry->d_inode->i_mapping;
        page = read_mapping_page(mapping, 0, NULL);
        if (IS_ERR(page))
                return (char*)page;
        *ppage = page;
-        return kmap(page);
+        kaddr = kmap(page);
+        nd_terminate_link(kaddr, dentry->d_inode->i_size, PAGE_SIZE - 1);
+        return kaddr;
 }
 int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
@@ -2796,18 +2818,23 @@ void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
        }
 }
-int __page_symlink(struct inode *inode, const char *symname, int len,
+/*
-                gfp_t gfp_mask)
+ * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS
+ */
+int __page_symlink(struct inode *inode, const char *symname, int len, int nofs)
 {
        struct address_space *mapping = inode->i_mapping;
        struct page *page;
        void *fsdata;
        int err;
        char *kaddr;
+        unsigned int flags = AOP_FLAG_UNINTERRUPTIBLE;
+        if (nofs)
+                flags |= AOP_FLAG_NOFS;
 retry:
        err = pagecache_write_begin(NULL, mapping, 0, len-1,
-                                AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
+                                flags, &page, &fsdata);
        if (err)
                goto fail;
@@ -2831,7 +2858,7 @@ fail:
 int page_symlink(struct inode *inode, const char *symname, int len)
 {
        return __page_symlink(inode, symname, len,
-                        mapping_gfp_mask(inode->i_mapping));
+                        !(mapping_gfp_mask(inode->i_mapping) & __GFP_FS));
 }
 const struct inode_operations page_symlink_inode_operations = {
@@ -2857,7 +2884,6 @@ EXPORT_SYMBOL(path_lookup);
 EXPORT_SYMBOL(kern_path);
 EXPORT_SYMBOL(vfs_path_lookup);
 EXPORT_SYMBOL(inode_permission);
-EXPORT_SYMBOL(vfs_permission);
 EXPORT_SYMBOL(file_permission);
 EXPORT_SYMBOL(unlock_rename);
 EXPORT_SYMBOL(vfs_create);
@@ -2873,3 +2899,10 @@ EXPORT_SYMBOL(vfs_symlink);
 EXPORT_SYMBOL(vfs_unlink);
 EXPORT_SYMBOL(dentry_unhash);
 EXPORT_SYMBOL(generic_readlink);
+/* to be mentioned only in INIT_TASK */
+struct fs_struct init_fs = {
+        .count          = ATOMIC_INIT(1),
+        .lock           = __RW_LOCK_UNLOCKED(init_fs.lock),
+        .umask          = 0022,
+};
diff --git a/fs/namespace.c b/fs/namespace.c
index 1c09cab8f7cf..228d8c4bfd18 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1128,7 +1128,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
 * unixes. Our API is identical to OSF/1 to avoid making a mess of AMD
 */
-asmlinkage long sys_umount(char __user * name, int flags)
+SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
 {
        struct path path;
        int retval;
@@ -1160,7 +1160,7 @@ out:
 /*
 *      The 2.0 compatible umount. No flags.
 */
-asmlinkage long sys_oldumount(char __user * name)
+SYSCALL_DEFINE1(oldumount, char __user *, name)
 {
        return sys_umount(name, 0);
 }
@@ -1990,7 +1990,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
        if (!new_ns->root) {
                up_write(&namespace_sem);
                kfree(new_ns);
-                return ERR_PTR(-ENOMEM);;
+                return ERR_PTR(-ENOMEM);
        }
        spin_lock(&vfsmount_lock);
        list_add_tail(&new_ns->list, &new_ns->root->mnt_list);
@@ -2045,9 +2045,8 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
        return new_ns;
 }
-asmlinkage long sys_mount(char __user * dev_name, char __user * dir_name,
+SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
-                          char __user * type, unsigned long flags,
+                char __user *, type, unsigned long, flags, void __user *, data)
-                          void __user * data)
 {
        int retval;
        unsigned long data_page;
@@ -2172,8 +2171,8 @@ static void chroot_fs_refs(struct path *old_root, struct path *new_root)
 *    though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
 *    first.
 */
-asmlinkage long sys_pivot_root(const char __user * new_root,
+SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
-                               const char __user * put_old)
+                const char __user *, put_old)
 {
        struct vfsmount *tmp;
        struct path new, old, parent_path, root_parent, root;
diff --git a/fs/ncpfs/Kconfig b/fs/ncpfs/Kconfig
index 142808427b25..c931cf22a1f6 100644
--- a/fs/ncpfs/Kconfig
+++ b/fs/ncpfs/Kconfig
@@ -1,6 +1,27 @@
 #
 # NCP Filesystem configuration
 #
+config NCP_FS
+        tristate "NCP file system support (to mount NetWare volumes)"
+        depends on IPX!=n || INET
+        help
+          NCP (NetWare Core Protocol) is a protocol that runs over IPX and is
+          used by Novell NetWare clients to talk to file servers.  It is to
+          IPX what NFS is to TCP/IP, if that helps.  Saying Y here allows you
+          to mount NetWare file server volumes and to access them just like
+          any other Unix directory.  For details, please read the file
+          <file:Documentation/filesystems/ncpfs.txt> in the kernel source and
+          the IPX-HOWTO from <http://www.tldp.org/docs.html#howto>.
+          You do not have to say Y here if you want your Linux box to act as a
+          file *server* for Novell NetWare clients.
+          General information about how to connect Linux, Windows machines and
+          Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>.
+          To compile this as a module, choose M here: the module will be called
+          ncpfs.  Say N unless you are connected to a Novell network.
 config NCPFS_PACKET_SIGNING
        bool "Packet signatures"
        depends on NCP_FS
diff --git a/fs/ncpfs/getopt.c b/fs/ncpfs/getopt.c
index 335b003dddf9..0af3349de851 100644
--- a/fs/ncpfs/getopt.c
+++ b/fs/ncpfs/getopt.c
@@ -16,7 +16,6 @@
 *      @opts: an array of &struct option entries controlling parser operations
 *      @optopt: output; will contain the current option
 *      @optarg: output; will contain the value (if one exists)
- *      @flag: output; may be NULL; should point to a long for or'ing flags
 *      @value: output; may be NULL; will be overwritten with the integer value
 *              of the current argument.
 *
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index 6d04e050c74e..f54360f50a9c 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -98,7 +98,7 @@ struct compat_ncp_objectname_ioctl
 {
        s32             auth_type;
        u32             object_name_len;
-        compat_caddr_t  object_name;    /* an userspace data, in most cases user name */
+        compat_caddr_t  object_name;    /* a userspace data, in most cases user name */
 };
 struct compat_ncp_fs_info_v2 {
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
new file mode 100644
index 000000000000..36fe20d6eba2
--- /dev/null
+++ b/fs/nfs/Kconfig
@@ -0,0 +1,86 @@
+config NFS_FS
+        tristate "NFS client support"
+        depends on INET
+        select LOCKD
+        select SUNRPC
+        select NFS_ACL_SUPPORT if NFS_V3_ACL
+        help
+          Choose Y here if you want to access files residing on other
+          computers using Sun's Network File System protocol.  To compile
+          this file system support as a module, choose M here: the module
+          will be called nfs.
+          To mount file systems exported by NFS servers, you also need to
+          install the user space mount.nfs command which can be found in
+          the Linux nfs-utils package, available from http://linux-nfs.org/.
+          Information about using the mount command is available in the
+          mount(8) man page.  More detail about the Linux NFS client
+          implementation is available via the nfs(5) man page.
+          Below you can choose which versions of the NFS protocol are
+          available in the kernel to mount NFS servers.  Support for NFS
+          version 2 (RFC 1094) is always available when NFS_FS is selected.
+          To configure a system which mounts its root file system via NFS
+          at boot time, say Y here, select "Kernel level IP
+          autoconfiguration" in the NETWORK menu, and select "Root file
+          system on NFS" below.  You cannot compile this file system as a
+          module in this case.
+          If unsure, say N.
+config NFS_V3
+        bool "NFS client support for NFS version 3"
+        depends on NFS_FS
+        help
+          This option enables support for version 3 of the NFS protocol
+          (RFC 1813) in the kernel's NFS client.
+          If unsure, say Y.
+config NFS_V3_ACL
+        bool "NFS client support for the NFSv3 ACL protocol extension"
+        depends on NFS_V3
+        help
+          Some NFS servers support an auxiliary NFSv3 ACL protocol that
+          Sun added to Solaris but never became an official part of the
+          NFS version 3 protocol.  This protocol extension allows
+          applications on NFS clients to manipulate POSIX Access Control
+          Lists on files residing on NFS servers.  NFS servers enforce
+          ACLs on local files whether this protocol is available or not.
+          Choose Y here if your NFS server supports the Solaris NFSv3 ACL
+          protocol extension and you want your NFS client to allow
+          applications to access and modify ACLs on files on the server.
+          Most NFS servers don't support the Solaris NFSv3 ACL protocol
+          extension.  You can choose N here or specify the "noacl" mount
+          option to prevent your NFS client from trying to use the NFSv3
+          ACL protocol.
+          If unsure, say N.
+config NFS_V4
+        bool "NFS client support for NFS version 4 (EXPERIMENTAL)"
+        depends on NFS_FS && EXPERIMENTAL
+        select RPCSEC_GSS_KRB5
+        help
+          This option enables support for version 4 of the NFS protocol
+          (RFC 3530) in the kernel's NFS client.
+          To mount NFS servers using NFSv4, you also need to install user
+          space programs which can be found in the Linux nfs-utils package,
+          available from http://linux-nfs.org/.
+          If unsure, say N.
+config ROOT_NFS
+        bool "Root file system on NFS"
+        depends on NFS_FS=y && IP_PNP
+        help
+          If you want your system to mount its root file system via NFS,
+          choose Y here.  This is common practice for managing systems
+          without local permanent storage.  For details, read
+          <file:Documentation/filesystems/nfsroot.txt>.
+          Most people say N here.
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index d319b49f8f06..90f292b520d2 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -354,7 +354,7 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
                file->f_path.dentry->d_name.name,
                mapping->host->i_ino, len, (long long) pos);
-        page = __grab_cache_page(mapping, index);
+        page = grab_cache_page_write_begin(mapping, index, flags);
        if (!page)
                return -ENOMEM;
        *pagep = page;
diff --git a/fs/nfsctl.c b/fs/nfsctl.c
index b1acbd6ab6fb..8f9a20556f79 100644
--- a/fs/nfsctl.c
+++ b/fs/nfsctl.c
@@ -38,9 +38,10 @@ static struct file *do_open(char *name, int flags)
                return ERR_PTR(error);
        if (flags == O_RDWR)
-                error = may_open(&nd,MAY_READ|MAY_WRITE,FMODE_READ|FMODE_WRITE);
+                error = may_open(&nd.path, MAY_READ|MAY_WRITE,
+                                           FMODE_READ|FMODE_WRITE);
        else
-                error = may_open(&nd, MAY_WRITE, FMODE_WRITE);
+                error = may_open(&nd.path, MAY_WRITE, FMODE_WRITE);
        if (!error)
                return dentry_open(nd.path.dentry, nd.path.mnt, flags,
@@ -85,8 +86,8 @@ static struct {
        },
 };
-long
+SYSCALL_DEFINE3(nfsservctl, int, cmd, struct nfsctl_arg __user *, arg,
-asmlinkage sys_nfsservctl(int cmd, struct nfsctl_arg __user *arg, void __user *res)
+                void __user *, res)
 {
        struct file *file;
        void __user *p = &arg->u;
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
new file mode 100644
index 000000000000..44d7d04dab95
--- /dev/null
+++ b/fs/nfsd/Kconfig
@@ -0,0 +1,80 @@
+config NFSD
+        tristate "NFS server support"
+        depends on INET
+        select LOCKD
+        select SUNRPC
+        select EXPORTFS
+        select NFS_ACL_SUPPORT if NFSD_V2_ACL
+        help
+          Choose Y here if you want to allow other computers to access
+          files residing on this system using Sun's Network File System
+          protocol.  To compile the NFS server support as a module,
+          choose M here: the module will be called nfsd.
+          You may choose to use a user-space NFS server instead, in which
+          case you can choose N here.
+          To export local file systems using NFS, you also need to install
+          user space programs which can be found in the Linux nfs-utils
+          package, available from http://linux-nfs.org/.  More detail about
+          the Linux NFS server implementation is available via the
+          exports(5) man page.
+          Below you can choose which versions of the NFS protocol are
+          available to clients mounting the NFS server on this system.
+          Support for NFS version 2 (RFC 1094) is always available when
+          CONFIG_NFSD is selected.
+          If unsure, say N.
+config NFSD_V2_ACL
+        bool
+        depends on NFSD
+config NFSD_V3
+        bool "NFS server support for NFS version 3"
+        depends on NFSD
+        help
+          This option enables support in your system's NFS server for
+          version 3 of the NFS protocol (RFC 1813).
+          If unsure, say Y.
+config NFSD_V3_ACL
+        bool "NFS server support for the NFSv3 ACL protocol extension"
+        depends on NFSD_V3
+        select NFSD_V2_ACL
+        help
+          Solaris NFS servers support an auxiliary NFSv3 ACL protocol that
+          never became an official part of the NFS version 3 protocol.
+          This protocol extension allows applications on NFS clients to
+          manipulate POSIX Access Control Lists on files residing on NFS
+          servers.  NFS servers enforce POSIX ACLs on local files whether
+          this protocol is available or not.
+          This option enables support in your system's NFS server for the
+          NFSv3 ACL protocol extension allowing NFS clients to manipulate
+          POSIX ACLs on files exported by your system's NFS server.  NFS
+          clients which support the Solaris NFSv3 ACL protocol can then
+          access and modify ACLs on your NFS server.
+          To store ACLs on your NFS server, you also need to enable ACL-
+          related CONFIG options for your local file systems of choice.
+          If unsure, say N.
+config NFSD_V4
+        bool "NFS server support for NFS version 4 (EXPERIMENTAL)"
+        depends on NFSD && PROC_FS && EXPERIMENTAL
+        select NFSD_V3
+        select FS_POSIX_ACL
+        select RPCSEC_GSS_KRB5
+        help
+          This option enables support in your system's NFS server for
+          version 4 of the NFS protocol (RFC 3530).
+          To export files using NFSv4, you need to install additional user
+          space programs which can be found in the Linux nfs-utils package,
+          available from http://linux-nfs.org/.
+          If unsure, say N.
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index 0184fe9b514c..5573508f707f 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -49,6 +49,8 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
                new->fsuid = exp->ex_anon_uid;
                new->fsgid = exp->ex_anon_gid;
                gi = groups_alloc(0);
+                if (!gi)
+                        goto oom;
        } else if (flags & NFSEXP_ROOTSQUASH) {
                if (!new->fsuid)
                        new->fsuid = exp->ex_anon_uid;
@@ -76,15 +78,16 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
        ret = set_groups(new, gi);
        put_group_info(gi);
-        if (!ret)
+        if (ret < 0)
                goto error;
-        if (new->uid)
+        if (new->fsuid)
                new->cap_effective = cap_drop_nfsd_set(new->cap_effective);
        else
                new->cap_effective = cap_raise_nfsd_set(new->cap_effective,
                                                        new->cap_permitted);
        put_cred(override_creds(new));
+        put_cred(new);
        return 0;
 oom:
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 6d7d8c02c197..c464181b5994 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -53,9 +53,6 @@
 #define NFSPROC4_CB_NULL 0
 #define NFSPROC4_CB_COMPOUND 1
-/* declarations */
-static const struct rpc_call_ops nfs4_cb_null_ops;
 /* Index of predefined Linux callback client operations */
 enum {
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 669461e291ae..9fa60a3ad48c 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -946,6 +946,11 @@ encode_op:
                        nfsd4_encode_operation(resp, op);
                        status = op->status;
                }
+                dprintk("nfsv4 compound op %p opcnt %d #%d: %d: status %d\n",
+                        args->ops, args->opcnt, resp->opcnt, op->opnum,
+                        be32_to_cpu(status));
                if (cstate->replay_owner) {
                        nfs4_put_stateowner(cstate->replay_owner);
                        cstate->replay_owner = NULL;
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 0f9d6efaa62b..74f7b67567fd 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -116,9 +116,9 @@ nfs4_make_rec_clidname(char *dname, struct xdr_netobj *clname)
        md5_to_hex(dname, cksum.data);
-        kfree(cksum.data);
        status = nfs_ok;
 out:
+        kfree(cksum.data);
        crypto_free_hash(desc.tfm);
 out_no_tfm:
        return status;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 13e0e074dbb8..b6f60f48e94b 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2416,6 +2416,26 @@ out:
 #define LOCK_HASH_SIZE             (1 << LOCK_HASH_BITS)
 #define LOCK_HASH_MASK             (LOCK_HASH_SIZE - 1)
+static inline u64
+end_offset(u64 start, u64 len)
+{
+        u64 end;
+        end = start + len;
+        return end >= start ? end: NFS4_MAX_UINT64;
+}
+/* last octet in a range */
+static inline u64
+last_byte_offset(u64 start, u64 len)
+{
+        u64 end;
+        BUG_ON(!len);
+        end = start + len;
+        return end > start ? end - 1: NFS4_MAX_UINT64;
+}
 #define lockownerid_hashval(id) \
        ((id) & LOCK_HASH_MASK)
@@ -2435,13 +2455,13 @@ static struct list_head lockstateid_hashtbl[STATEID_HASH_SIZE];
 static struct nfs4_stateid *
 find_stateid(stateid_t *stid, int flags)
 {
-        struct nfs4_stateid *local = NULL;
+        struct nfs4_stateid *local;
        u32 st_id = stid->si_stateownerid;
        u32 f_id = stid->si_fileid;
        unsigned int hashval;
        dprintk("NFSD: find_stateid flags 0x%x\n",flags);
-        if ((flags & LOCK_STATE) || (flags & RD_STATE) || (flags & WR_STATE)) {
+        if (flags & (LOCK_STATE | RD_STATE | WR_STATE)) {
                hashval = stateid_hashval(st_id, f_id);
                list_for_each_entry(local, &lockstateid_hashtbl[hashval], st_hash) {
                        if ((local->st_stateid.si_stateownerid == st_id) &&
@@ -2449,7 +2469,8 @@ find_stateid(stateid_t *stid, int flags)
                                return local;
                }
        } 
-        if ((flags & OPEN_STATE) || (flags & RD_STATE) || (flags & WR_STATE)) {
+        if (flags & (OPEN_STATE | RD_STATE | WR_STATE)) {
                hashval = stateid_hashval(st_id, f_id);
                list_for_each_entry(local, &stateid_hashtbl[hashval], st_hash) {
                        if ((local->st_stateid.si_stateownerid == st_id) &&
@@ -2518,8 +2539,8 @@ nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny)
                deny->ld_clientid.cl_id = 0;
        }
        deny->ld_start = fl->fl_start;
-        deny->ld_length = ~(u64)0;
+        deny->ld_length = NFS4_MAX_UINT64;
-        if (fl->fl_end != ~(u64)0)
+        if (fl->fl_end != NFS4_MAX_UINT64)
                deny->ld_length = fl->fl_end - fl->fl_start + 1;        
        deny->ld_type = NFS4_READ_LT;
        if (fl->fl_type != F_RDLCK)
@@ -2616,7 +2637,7 @@ out:
 static int
 check_lock_length(u64 offset, u64 length)
 {
-        return ((length == 0)  || ((length != ~(u64)0) &&
+        return ((length == 0)  || ((length != NFS4_MAX_UINT64) &&
             LOFF_OVERFLOW(offset, length)));
 }
@@ -2736,11 +2757,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
        file_lock.fl_lmops = &nfsd_posix_mng_ops;
        file_lock.fl_start = lock->lk_offset;
-        if ((lock->lk_length == ~(u64)0) || 
+        file_lock.fl_end = last_byte_offset(lock->lk_offset, lock->lk_length);
-                        LOFF_OVERFLOW(lock->lk_offset, lock->lk_length))
-                file_lock.fl_end = ~(u64)0;
-        else
-                file_lock.fl_end = lock->lk_offset + lock->lk_length - 1;
        nfs4_transform_lock_offset(&file_lock);
        /*
@@ -2781,6 +2798,25 @@ out:
 }
 /*
+ * The NFSv4 spec allows a client to do a LOCKT without holding an OPEN,
+ * so we do a temporary open here just to get an open file to pass to
+ * vfs_test_lock.  (Arguably, test_lock should be done with an
+ * inode operation.)
+ */
+static int nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file_lock *lock)
+{
+        struct file *file;
+        int err;
+        err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file);
+        if (err)
+                return err;
+        err = vfs_test_lock(file, lock);
+        nfsd_close(file);
+        return err;
+}
+/*
 * LOCKT operation
 */
 __be32
@@ -2788,7 +2824,6 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
            struct nfsd4_lockt *lockt)
 {
        struct inode *inode;
-        struct file file;
        struct file_lock file_lock;
        int error;
        __be32 status;
@@ -2836,26 +2871,14 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
                file_lock.fl_owner = (fl_owner_t)lockt->lt_stateowner;
        file_lock.fl_pid = current->tgid;
        file_lock.fl_flags = FL_POSIX;
-        file_lock.fl_lmops = &nfsd_posix_mng_ops;
        file_lock.fl_start = lockt->lt_offset;
-        if ((lockt->lt_length == ~(u64)0) || LOFF_OVERFLOW(lockt->lt_offset, lockt->lt_length))
+        file_lock.fl_end = last_byte_offset(lockt->lt_offset, lockt->lt_length);
-                file_lock.fl_end = ~(u64)0;
-        else
-                file_lock.fl_end = lockt->lt_offset + lockt->lt_length - 1;
        nfs4_transform_lock_offset(&file_lock);
-        /* vfs_test_lock uses the struct file _only_ to resolve the inode.
-         * since LOCKT doesn't require an OPEN, and therefore a struct
-         * file may not exist, pass vfs_test_lock a struct file with
-         * only the dentry:inode set.
-         */
-        memset(&file, 0, sizeof (struct file));
-        file.f_path.dentry = cstate->current_fh.fh_dentry;
        status = nfs_ok;
-        error = vfs_test_lock(&file, &file_lock);
+        error = nfsd_test_lock(rqstp, &cstate->current_fh, &file_lock);
        if (error) {
                status = nfserrno(error);
                goto out;
@@ -2906,10 +2929,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
        file_lock.fl_lmops = &nfsd_posix_mng_ops;
        file_lock.fl_start = locku->lu_offset;
-        if ((locku->lu_length == ~(u64)0) || LOFF_OVERFLOW(locku->lu_offset, locku->lu_length))
+        file_lock.fl_end = last_byte_offset(locku->lu_offset, locku->lu_length);
-                file_lock.fl_end = ~(u64)0;
-        else
-                file_lock.fl_end = locku->lu_offset + locku->lu_length - 1;
        nfs4_transform_lock_offset(&file_lock);
        /*
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index afcdf4b76843..f65953be39c0 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1,6 +1,4 @@
 /*
- *  fs/nfs/nfs4xdr.c
- *
 *  Server-side XDR for NFSv4
 *
 *  Copyright (c) 2002 The Regents of the University of Michigan.
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 77d7b8c531a6..3d93b2064ce5 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -84,6 +84,8 @@ static ssize_t write_unexport(struct file *file, char *buf, size_t size);
 static ssize_t write_getfd(struct file *file, char *buf, size_t size);
 static ssize_t write_getfs(struct file *file, char *buf, size_t size);
 static ssize_t write_filehandle(struct file *file, char *buf, size_t size);
+static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size);
+static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size);
 static ssize_t write_threads(struct file *file, char *buf, size_t size);
 static ssize_t write_pool_threads(struct file *file, char *buf, size_t size);
 static ssize_t write_versions(struct file *file, char *buf, size_t size);
@@ -94,9 +96,6 @@ static ssize_t write_leasetime(struct file *file, char *buf, size_t size);
 static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
 #endif
-static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size);
-static ssize_t failover_unlock_fs(struct file *file, char *buf, size_t size);
 static ssize_t (*write_op[])(struct file *, char *, size_t) = {
        [NFSD_Svc] = write_svc,
        [NFSD_Add] = write_add,
@@ -106,8 +105,8 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = {
        [NFSD_Getfd] = write_getfd,
        [NFSD_Getfs] = write_getfs,
        [NFSD_Fh] = write_filehandle,
-        [NFSD_FO_UnlockIP] = failover_unlock_ip,
+        [NFSD_FO_UnlockIP] = write_unlock_ip,
-        [NFSD_FO_UnlockFS] = failover_unlock_fs,
+        [NFSD_FO_UnlockFS] = write_unlock_fs,
        [NFSD_Threads] = write_threads,
        [NFSD_Pool_Threads] = write_pool_threads,
        [NFSD_Versions] = write_versions,
@@ -176,10 +175,24 @@ static const struct file_operations exports_operations = {
 /*----------------------------------------------------------------------------*/
 /*
 * payload - write methods
- * If the method has a response, the response should be put in buf,
- * and the length returned.  Otherwise return 0 or and -error.
 */
+/**
+ * write_svc - Start kernel's NFSD server
+ *
+ * Deprecated.  /proc/fs/nfsd/threads is preferred.
+ * Function remains to support old versions of nfs-utils.
+ *
+ * Input:
+ *                      buf:    struct nfsctl_svc
+ *                              svc_port:       port number of this
+ *                                              server's listener
+ *                              svc_nthreads:   number of threads to start
+ *                      size:   size in bytes of passed in nfsctl_svc
+ * Output:
+ *      On success:     returns zero
+ *      On error:       return code is negative errno value
+ */
 static ssize_t write_svc(struct file *file, char *buf, size_t size)
 {
        struct nfsctl_svc *data;
@@ -189,6 +202,30 @@ static ssize_t write_svc(struct file *file, char *buf, size_t size)
        return nfsd_svc(data->svc_port, data->svc_nthreads);
 }
+/**
+ * write_add - Add or modify client entry in auth unix cache
+ *
+ * Deprecated.  /proc/net/rpc/auth.unix.ip is preferred.
+ * Function remains to support old versions of nfs-utils.
+ *
+ * Input:
+ *                      buf:    struct nfsctl_client
+ *                              cl_ident:       '\0'-terminated C string
+ *                                              containing domain name
+ *                                              of client
+ *                              cl_naddr:       no. of items in cl_addrlist
+ *                              cl_addrlist:    array of client addresses
+ *                              cl_fhkeytype:   ignored
+ *                              cl_fhkeylen:    ignored
+ *                              cl_fhkey:       ignored
+ *                      size:   size in bytes of passed in nfsctl_client
+ * Output:
+ *      On success:     returns zero
+ *      On error:       return code is negative errno value
+ *
+ * Note: Only AF_INET client addresses are passed in, since
+ * nfsctl_client.cl_addrlist contains only in_addr fields for addresses.
+ */
 static ssize_t write_add(struct file *file, char *buf, size_t size)
 {
        struct nfsctl_client *data;
@@ -198,6 +235,30 @@ static ssize_t write_add(struct file *file, char *buf, size_t size)
        return exp_addclient(data);
 }
+/**
+ * write_del - Remove client from auth unix cache
+ *
+ * Deprecated.  /proc/net/rpc/auth.unix.ip is preferred.
+ * Function remains to support old versions of nfs-utils.
+ *
+ * Input:
+ *                      buf:    struct nfsctl_client
+ *                              cl_ident:       '\0'-terminated C string
+ *                                              containing domain name
+ *                                              of client
+ *                              cl_naddr:       ignored
+ *                              cl_addrlist:    ignored
+ *                              cl_fhkeytype:   ignored
+ *                              cl_fhkeylen:    ignored
+ *                              cl_fhkey:       ignored
+ *                      size:   size in bytes of passed in nfsctl_client
+ * Output:
+ *      On success:     returns zero
+ *      On error:       return code is negative errno value
+ *
+ * Note: Only AF_INET client addresses are passed in, since
+ * nfsctl_client.cl_addrlist contains only in_addr fields for addresses.
+ */
 static ssize_t write_del(struct file *file, char *buf, size_t size)
 {
        struct nfsctl_client *data;
@@ -207,6 +268,33 @@ static ssize_t write_del(struct file *file, char *buf, size_t size)
        return exp_delclient(data);
 }
+/**
+ * write_export - Export part or all of a local file system
+ *
+ * Deprecated.  /proc/net/rpc/{nfsd.export,nfsd.fh} are preferred.
+ * Function remains to support old versions of nfs-utils.
+ *
+ * Input:
+ *                      buf:    struct nfsctl_export
+ *                              ex_client:      '\0'-terminated C string
+ *                                              containing domain name
+ *                                              of client allowed to access
+ *                                              this export
+ *                              ex_path:        '\0'-terminated C string
+ *                                              containing pathname of
+ *                                              directory in local file system
+ *                              ex_dev:         fsid to use for this export
+ *                              ex_ino:         ignored
+ *                              ex_flags:       export flags for this export
+ *                              ex_anon_uid:    UID to use for anonymous
+ *                                              requests
+ *                              ex_anon_gid:    GID to use for anonymous
+ *                                              requests
+ *                      size:   size in bytes of passed in nfsctl_export
+ * Output:
+ *      On success:     returns zero
+ *      On error:       return code is negative errno value
+ */
 static ssize_t write_export(struct file *file, char *buf, size_t size)
 {
        struct nfsctl_export *data;
@@ -216,6 +304,31 @@ static ssize_t write_export(struct file *file, char *buf, size_t size)
        return exp_export(data);
 }
+/**
+ * write_unexport - Unexport a previously exported file system
+ *
+ * Deprecated.  /proc/net/rpc/{nfsd.export,nfsd.fh} are preferred.
+ * Function remains to support old versions of nfs-utils.
+ *
+ * Input:
+ *                      buf:    struct nfsctl_export
+ *                              ex_client:      '\0'-terminated C string
+ *                                              containing domain name
+ *                                              of client no longer allowed
+ *                                              to access this export
+ *                              ex_path:        '\0'-terminated C string
+ *                                              containing pathname of
+ *                                              directory in local file system
+ *                              ex_dev:         ignored
+ *                              ex_ino:         ignored
+ *                              ex_flags:       ignored
+ *                              ex_anon_uid:    ignored
+ *                              ex_anon_gid:    ignored
+ *                      size:   size in bytes of passed in nfsctl_export
+ * Output:
+ *      On success:     returns zero
+ *      On error:       return code is negative errno value
+ */
 static ssize_t write_unexport(struct file *file, char *buf, size_t size)
 {
        struct nfsctl_export *data;
@@ -226,6 +339,30 @@ static ssize_t write_unexport(struct file *file, char *buf, size_t size)
        return exp_unexport(data);
 }
+/**
+ * write_getfs - Get a variable-length NFS file handle by path
+ *
+ * Deprecated.  /proc/fs/nfsd/filehandle is preferred.
+ * Function remains to support old versions of nfs-utils.
+ *
+ * Input:
+ *                      buf:    struct nfsctl_fsparm
+ *                              gd_addr:        socket address of client
+ *                              gd_path:        '\0'-terminated C string
+ *                                              containing pathname of
+ *                                              directory in local file system
+ *                              gd_maxlen:      maximum size of returned file
+ *                                              handle
+ *                      size:   size in bytes of passed in nfsctl_fsparm
+ * Output:
+ *      On success:     passed-in buffer filled with a knfsd_fh structure
+ *                      (a variable-length raw NFS file handle);
+ *                      return code is the size in bytes of the file handle
+ *      On error:       return code is negative errno value
+ *
+ * Note: Only AF_INET client addresses are passed in, since gd_addr
+ * is the same size as a struct sockaddr_in.
+ */
 static ssize_t write_getfs(struct file *file, char *buf, size_t size)
 {
        struct nfsctl_fsparm *data;
@@ -265,6 +402,29 @@ static ssize_t write_getfs(struct file *file, char *buf, size_t size)
        return err;
 }
+/**
+ * write_getfd - Get a fixed-length NFS file handle by path (used by mountd)
+ *
+ * Deprecated.  /proc/fs/nfsd/filehandle is preferred.
+ * Function remains to support old versions of nfs-utils.
+ *
+ * Input:
+ *                      buf:    struct nfsctl_fdparm
+ *                              gd_addr:        socket address of client
+ *                              gd_path:        '\0'-terminated C string
+ *                                              containing pathname of
+ *                                              directory in local file system
+ *                              gd_version:     fdparm structure version
+ *                      size:   size in bytes of passed in nfsctl_fdparm
+ * Output:
+ *      On success:     passed-in buffer filled with nfsctl_res
+ *                      (a fixed-length raw NFS file handle);
+ *                      return code is the size in bytes of the file handle
+ *      On error:       return code is negative errno value
+ *
+ * Note: Only AF_INET client addresses are passed in, since gd_addr
+ * is the same size as a struct sockaddr_in.
+ */
 static ssize_t write_getfd(struct file *file, char *buf, size_t size)
 {
        struct nfsctl_fdparm *data;
@@ -309,7 +469,23 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size)
        return err;
 }
-static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size)
+/**
+ * write_unlock_ip - Release all locks used by a client
+ *
+ * Experimental.
+ *
+ * Input:
+ *                      buf:    '\n'-terminated C string containing a
+ *                              presentation format IPv4 address
+ *                      size:   length of C string in @buf
+ * Output:
+ *      On success:     returns zero if all specified locks were released;
+ *                      returns one if one or more locks were not released
+ *      On error:       return code is negative errno value
+ *
+ * Note: Only AF_INET client addresses are passed in
+ */
+static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
 {
        struct sockaddr_in sin = {
                .sin_family     = AF_INET,
@@ -339,7 +515,21 @@ static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size)
        return nlmsvc_unlock_all_by_ip((struct sockaddr *)&sin);
 }
-static ssize_t failover_unlock_fs(struct file *file, char *buf, size_t size)
+/**
+ * write_unlock_fs - Release all locks on a local file system
+ *
+ * Experimental.
+ *
+ * Input:
+ *                      buf:    '\n'-terminated C string containing the
+ *                              absolute pathname of a local file system
+ *                      size:   length of C string in @buf
+ * Output:
+ *      On success:     returns zero if all specified locks were released;
+ *                      returns one if one or more locks were not released
+ *      On error:       return code is negative errno value
+ */
+static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size)
 {
        struct path path;
        char *fo_path;
@@ -360,21 +550,44 @@ static ssize_t failover_unlock_fs(struct file *file, char *buf, size_t size)
        if (error)
                return error;
+        /*
+         * XXX: Needs better sanity checking.  Otherwise we could end up
+         * releasing locks on the wrong file system.
+         *
+         * For example:
+         * 1.  Does the path refer to a directory?
+         * 2.  Is that directory a mount point, or
+         * 3.  Is that directory the root of an exported file system?
+         */
        error = nlmsvc_unlock_all_by_sb(path.mnt->mnt_sb);
        path_put(&path);
        return error;
 }
+/**
+ * write_filehandle - Get a variable-length NFS file handle by path
+ *
+ * On input, the buffer contains a '\n'-terminated C string comprised of
+ * three alphanumeric words separated by whitespace.  The string may
+ * contain escape sequences.
+ *
+ * Input:
+ *                      buf:
+ *                              domain:         client domain name
+ *                              path:           export pathname
+ *                              maxsize:        numeric maximum size of
+ *                                              @buf
+ *                      size:   length of C string in @buf
+ * Output:
+ *      On success:     passed-in buffer filled with '\n'-terminated C
+ *                      string containing an ASCII hex text version
+ *                      of the NFS file handle;
+ *                      return code is the size in bytes of the string
+ *      On error:       return code is negative errno value
+ */
 static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
 {
-        /* request is:
-         *   domain path maxsize
-         * response is
-         *   filehandle
-         *
-         * qword quoting is used, so filehandle will be \x....
-         */
        char *dname, *path;
        int uninitialized_var(maxsize);
        char *mesg = buf;
@@ -391,11 +604,13 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
        dname = mesg;
        len = qword_get(&mesg, dname, size);
-        if (len <= 0) return -EINVAL;
+        if (len <= 0)
+                return -EINVAL;
        
        path = dname+len+1;
        len = qword_get(&mesg, path, size);
-        if (len <= 0) return -EINVAL;
+        if (len <= 0)
+                return -EINVAL;
        len = get_int(&mesg, &maxsize);
        if (len)
@@ -419,17 +634,43 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
        if (len)
                return len;
        
-        mesg = buf; len = SIMPLE_TRANSACTION_LIMIT;
+        mesg = buf;
+        len = SIMPLE_TRANSACTION_LIMIT;
        qword_addhex(&mesg, &len, (char*)&fh.fh_base, fh.fh_size);
        mesg[-1] = '\n';
        return mesg - buf;      
 }
+/**
+ * write_threads - Start NFSD, or report the current number of running threads
+ *
+ * Input:
+ *                      buf:            ignored
+ *                      size:           zero
+ * Output:
+ *      On success:     passed-in buffer filled with '\n'-terminated C
+ *                      string numeric value representing the number of
+ *                      running NFSD threads;
+ *                      return code is the size in bytes of the string
+ *      On error:       return code is zero
+ *
+ * OR
+ *
+ * Input:
+ *                      buf:            C string containing an unsigned
+ *                                      integer value representing the
+ *                                      number of NFSD threads to start
+ *                      size:           non-zero length of C string in @buf
+ * Output:
+ *      On success:     NFS service is started;
+ *                      passed-in buffer filled with '\n'-terminated C
+ *                      string numeric value representing the number of
+ *                      running NFSD threads;
+ *                      return code is the size in bytes of the string
+ *      On error:       return code is zero or a negative errno value
+ */
 static ssize_t write_threads(struct file *file, char *buf, size_t size)
 {
-        /* if size > 0, look for a number of threads and call nfsd_svc
-         * then write out number of threads as reply
-         */
        char *mesg = buf;
        int rv;
        if (size > 0) {
@@ -437,9 +678,9 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size)
                rv = get_int(&mesg, &newthreads);
                if (rv)
                        return rv;
-                if (newthreads <0)
+                if (newthreads < 0)
                        return -EINVAL;
-                rv = nfsd_svc(2049, newthreads);
+                rv = nfsd_svc(NFS_PORT, newthreads);
                if (rv)
                        return rv;
        }
@@ -447,6 +688,28 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size)
        return strlen(buf);
 }
+/**
+ * write_pool_threads - Set or report the current number of threads per pool
+ *
+ * Input:
+ *                      buf:            ignored
+ *                      size:           zero
+ *
+ * OR
+ *
+ * Input:
+ *                      buf:            C string containing whitespace-
+ *                                      separated unsigned integer values
+ *                                      representing the number of NFSD
+ *                                      threads to start in each pool
+ *                      size:           non-zero length of C string in @buf
+ * Output:
+ *      On success:     passed-in buffer filled with '\n'-terminated C
+ *                      string containing integer values representing the
+ *                      number of NFSD threads in each pool;
+ *                      return code is the size in bytes of the string
+ *      On error:       return code is zero or a negative errno value
+ */
 static ssize_t write_pool_threads(struct file *file, char *buf, size_t size)
 {
        /* if size > 0, look for an array of number of threads per node
@@ -517,10 +780,6 @@ out_free:
 static ssize_t __write_versions(struct file *file, char *buf, size_t size)
 {
-        /*
-         * Format:
-         *   [-/+]vers [-/+]vers ...
-         */
        char *mesg = buf;
        char *vers, sign;
        int len, num;
@@ -578,6 +837,38 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
        return len;
 }
+/**
+ * write_versions - Set or report the available NFS protocol versions
+ *
+ * Input:
+ *                      buf:            ignored
+ *                      size:           zero
+ * Output:
+ *      On success:     passed-in buffer filled with '\n'-terminated C
+ *                      string containing positive or negative integer
+ *                      values representing the current status of each
+ *                      protocol version;
+ *                      return code is the size in bytes of the string
+ *      On error:       return code is zero or a negative errno value
+ *
+ * OR
+ *
+ * Input:
+ *                      buf:            C string containing whitespace-
+ *                                      separated positive or negative
+ *                                      integer values representing NFS
+ *                                      protocol versions to enable ("+n")
+ *                                      or disable ("-n")
+ *                      size:           non-zero length of C string in @buf
+ * Output:
+ *      On success:     status of zero or more protocol versions has
+ *                      been updated; passed-in buffer filled with
+ *                      '\n'-terminated C string containing positive
+ *                      or negative integer values representing the
+ *                      current status of each protocol version;
+ *                      return code is the size in bytes of the string
+ *      On error:       return code is zero or a negative errno value
+ */
 static ssize_t write_versions(struct file *file, char *buf, size_t size)
 {
        ssize_t rv;
@@ -687,6 +978,75 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
        return -EINVAL;
 }
+/**
+ * write_ports - Pass a socket file descriptor or transport name to listen on
+ *
+ * Input:
+ *                      buf:            ignored
+ *                      size:           zero
+ * Output:
+ *      On success:     passed-in buffer filled with a '\n'-terminated C
+ *                      string containing a whitespace-separated list of
+ *                      named NFSD listeners;
+ *                      return code is the size in bytes of the string
+ *      On error:       return code is zero or a negative errno value
+ *
+ * OR
+ *
+ * Input:
+ *                      buf:            C string containing an unsigned
+ *                                      integer value representing a bound
+ *                                      but unconnected socket that is to be
+ *                                      used as an NFSD listener
+ *                      size:           non-zero length of C string in @buf
+ * Output:
+ *      On success:     NFS service is started;
+ *                      passed-in buffer filled with a '\n'-terminated C
+ *                      string containing a unique alphanumeric name of
+ *                      the listener;
+ *                      return code is the size in bytes of the string
+ *      On error:       return code is a negative errno value
+ *
+ * OR
+ *
+ * Input:
+ *                      buf:            C string containing a "-" followed
+ *                                      by an integer value representing a
+ *                                      previously passed in socket file
+ *                                      descriptor
+ *                      size:           non-zero length of C string in @buf
+ * Output:
+ *      On success:     NFS service no longer listens on that socket;
+ *                      passed-in buffer filled with a '\n'-terminated C
+ *                      string containing a unique name of the listener;
+ *                      return code is the size in bytes of the string
+ *      On error:       return code is a negative errno value
+ *
+ * OR
+ *
+ * Input:
+ *                      buf:            C string containing a transport
+ *                                      name and an unsigned integer value
+ *                                      representing the port to listen on,
+ *                                      separated by whitespace
+ *                      size:           non-zero length of C string in @buf
+ * Output:
+ *      On success:     returns zero; NFS service is started
+ *      On error:       return code is a negative errno value
+ *
+ * OR
+ *
+ * Input:
+ *                      buf:            C string containing a "-" followed
+ *                                      by a transport name and an unsigned
+ *                                      integer value representing the port
+ *                                      to listen on, separated by whitespace
+ *                      size:           non-zero length of C string in @buf
+ * Output:
+ *      On success:     returns zero; NFS service no longer listens
+ *                      on that transport
+ *      On error:       return code is a negative errno value
+ */
 static ssize_t write_ports(struct file *file, char *buf, size_t size)
 {
        ssize_t rv;
@@ -700,6 +1060,27 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size)
 int nfsd_max_blksize;
+/**
+ * write_maxblksize - Set or report the current NFS blksize
+ *
+ * Input:
+ *                      buf:            ignored
+ *                      size:           zero
+ *
+ * OR
+ *
+ * Input:
+ *                      buf:            C string containing an unsigned
+ *                                      integer value representing the new
+ *                                      NFS blksize
+ *                      size:           non-zero length of C string in @buf
+ * Output:
+ *      On success:     passed-in buffer filled with '\n'-terminated C string
+ *                      containing numeric value of the current NFS blksize
+ *                      setting;
+ *                      return code is the size in bytes of the string
+ *      On error:       return code is zero or a negative errno value
+ */
 static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
 {
        char *mesg = buf;
@@ -752,6 +1133,27 @@ static ssize_t __write_leasetime(struct file *file, char *buf, size_t size)
        return strlen(buf);
 }
+/**
+ * write_leasetime - Set or report the current NFSv4 lease time
+ *
+ * Input:
+ *                      buf:            ignored
+ *                      size:           zero
+ *
+ * OR
+ *
+ * Input:
+ *                      buf:            C string containing an unsigned
+ *                                      integer value representing the new
+ *                                      NFSv4 lease expiry time
+ *                      size:           non-zero length of C string in @buf
+ * Output:
+ *      On success:     passed-in buffer filled with '\n'-terminated C
+ *                      string containing unsigned integer value of the
+ *                      current lease expiry time;
+ *                      return code is the size in bytes of the string
+ *      On error:       return code is zero or a negative errno value
+ */
 static ssize_t write_leasetime(struct file *file, char *buf, size_t size)
 {
        ssize_t rv;
@@ -788,6 +1190,27 @@ static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size)
        return strlen(buf);
 }
+/**
+ * write_recoverydir - Set or report the pathname of the recovery directory
+ *
+ * Input:
+ *                      buf:            ignored
+ *                      size:           zero
+ *
+ * OR
+ *
+ * Input:
+ *                      buf:            C string containing the pathname
+ *                                      of the directory on a local file
+ *                                      system containing permanent NFSv4
+ *                                      recovery data
+ *                      size:           non-zero length of C string in @buf
+ * Output:
+ *      On success:     passed-in buffer filled with '\n'-terminated C string
+ *                      containing the current recovery pathname setting;
+ *                      return code is the size in bytes of the string
+ *      On error:       return code is zero or a negative errno value
+ */
 static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
 {
        ssize_t rv;
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index f0da7d9c3a92..9f1ca17293d3 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -258,14 +258,32 @@ out:
        return error;
 }
-/*
+/**
- * Perform sanity checks on the dentry in a client's file handle.
+ * fh_verify - filehandle lookup and access checking
+ * @rqstp: pointer to current rpc request
+ * @fhp: filehandle to be verified
+ * @type: expected type of object pointed to by filehandle
+ * @access: type of access needed to object
+ *
+ * Look up a dentry from the on-the-wire filehandle, check the client's
+ * access to the export, and set the current task's credentials.
+ *
+ * Regardless of success or failure of fh_verify(), fh_put() should be
+ * called on @fhp when the caller is finished with the filehandle.
 *
- * Note that the file handle dentry may need to be freed even after
+ * fh_verify() may be called multiple times on a given filehandle, for
- * an error return.
+ * example, when processing an NFSv4 compound.  The first call will look
+ * up a dentry using the on-the-wire filehandle.  Subsequent calls will
+ * skip the lookup and just perform the other checks and possibly change
+ * the current task's credentials.
 *
- * This is only called at the start of an nfsproc call, so fhp points to
+ * @type specifies the type of object expected using one of the S_IF*
- * a svc_fh which is all 0 except for the over-the-wire file handle.
+ * constants defined in include/linux/stat.h.  The caller may use zero
+ * to indicate that it doesn't care, or a negative integer to indicate
+ * that it expects something not of the given type.
+ *
+ * @access is formed from the NFSD_MAY_* constants defined in
+ * include/linux/nfsd/nfsd.h.
 */
 __be32
 fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
@@ -466,6 +484,8 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
                                goto retry;
                        break;
                }
+        } else if (exp->ex_flags & NFSEXP_FSID) {
+                fsid_type = FSID_NUM;
        } else if (exp->ex_uuid) {
                if (fhp->fh_maxsize >= 64) {
                        if (root_export)
@@ -478,9 +498,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
                        else
                                fsid_type = FSID_UUID4_INUM;
                }
-        } else if (exp->ex_flags & NFSEXP_FSID)
+        } else if (!old_valid_dev(ex_dev))
-                fsid_type = FSID_NUM;
-        else if (!old_valid_dev(ex_dev))
                /* for newer device numbers, we must use a newer fsid format */
                fsid_type = FSID_ENCODE_DEV;
        else
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 5cffeca7acef..6f7f26351227 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -622,6 +622,7 @@ nfserrno (int errno)
                { nfserr_badname, -ESRCH },
                { nfserr_io, -ETXTBSY },
                { nfserr_notsupp, -EOPNOTSUPP },
+                { nfserr_toosmall, -ETOOSMALL },
        };
        int     i;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index d1c5f787b365..6e50aaa56ca2 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -764,7 +764,6 @@ static inline int nfsd_dosync(struct file *filp, struct dentry *dp,
        return err;
 }
-        
 static int
 nfsd_sync(struct file *filp)
@@ -1211,7 +1210,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
        dirp = dentry->d_inode;
        err = nfserr_notdir;
-        if(!dirp->i_op || !dirp->i_op->lookup)
+        if (!dirp->i_op->lookup)
                goto out;
        /*
         * Check whether the response file handle has been verified yet.
@@ -1347,7 +1346,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
        /* Get all the sanity checks out of the way before
         * we lock the parent. */
        err = nfserr_notdir;
-        if(!dirp->i_op || !dirp->i_op->lookup)
+        if (!dirp->i_op->lookup)
                goto out;
        fh_lock_nested(fhp, I_MUTEX_PARENT);
@@ -1482,7 +1481,7 @@ nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp)
        inode = dentry->d_inode;
        err = nfserr_inval;
-        if (!inode->i_op || !inode->i_op->readlink)
+        if (!inode->i_op->readlink)
                goto out;
        touch_atime(fhp->fh_export->ex_path.mnt, dentry);
@@ -2162,7 +2161,7 @@ nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl)
        size_t size;
        int error;
-        if (!IS_POSIXACL(inode) || !inode->i_op ||
+        if (!IS_POSIXACL(inode) ||
            !inode->i_op->setxattr || !inode->i_op->removexattr)
                return -EOPNOTSUPP;
        switch(type) {
diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig
new file mode 100644
index 000000000000..50914d7303c6
--- /dev/null
+++ b/fs/notify/Kconfig
@@ -0,0 +1,2 @@
+source "fs/notify/dnotify/Kconfig"
+source "fs/notify/inotify/Kconfig"
diff --git a/fs/notify/Makefile b/fs/notify/Makefile
new file mode 100644
index 000000000000..5a95b6010ce7
--- /dev/null
+++ b/fs/notify/Makefile
@@ -0,0 +1,2 @@
+obj-y                   += dnotify/
+obj-y                   += inotify/
diff --git a/fs/notify/dnotify/Kconfig b/fs/notify/dnotify/Kconfig
new file mode 100644
index 000000000000..26adf5dfa646
--- /dev/null
+++ b/fs/notify/dnotify/Kconfig
@@ -0,0 +1,10 @@
+config DNOTIFY
+        bool "Dnotify support"
+        default y
+        help
+          Dnotify is a directory-based per-fd file change notification system
+          that uses signals to communicate events to user-space.  There exist
+          superior alternatives, but some applications may still rely on
+          dnotify.
+          If unsure, say Y.
diff --git a/fs/notify/dnotify/Makefile b/fs/notify/dnotify/Makefile
new file mode 100644
index 000000000000..f145251dcadb
--- /dev/null
+++ b/fs/notify/dnotify/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_DNOTIFY)           += dnotify.o
diff --git a/fs/dnotify.c b/fs/notify/dnotify/dnotify.c
index 676073b8dda5..b0aa2cde80bd 100644
--- a/fs/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -115,9 +115,6 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
        dn->dn_next = inode->i_dnotify;
        inode->i_dnotify = dn;
        spin_unlock(&inode->i_lock);
-        if (filp->f_op && filp->f_op->dir_notify)
-                return filp->f_op->dir_notify(filp, arg);
        return 0;
 out_free:
diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig
new file mode 100644
index 000000000000..446792841023
--- /dev/null
+++ b/fs/notify/inotify/Kconfig
@@ -0,0 +1,27 @@
+config INOTIFY
+        bool "Inotify file change notification support"
+        default y
+        ---help---
+          Say Y here to enable inotify support.  Inotify is a file change
+          notification system and a replacement for dnotify.  Inotify fixes
+          numerous shortcomings in dnotify and introduces several new features
+          including multiple file events, one-shot support, and unmount
+          notification.
+          For more information, see <file:Documentation/filesystems/inotify.txt>
+          If unsure, say Y.
+config INOTIFY_USER
+        bool "Inotify support for userspace"
+        depends on INOTIFY
+        default y
+        ---help---
+          Say Y here to enable inotify support for userspace, including the
+          associated system calls.  Inotify allows monitoring of both files and
+          directories via a single open fd.  Events are read from the file
+          descriptor, which is also select()- and poll()-able.
+          For more information, see <file:Documentation/filesystems/inotify.txt>
+          If unsure, say Y.
diff --git a/fs/notify/inotify/Makefile b/fs/notify/inotify/Makefile
new file mode 100644
index 000000000000..e290f3bb9d8d
--- /dev/null
+++ b/fs/notify/inotify/Makefile
@@ -0,0 +1,2 @@
+obj-$(CONFIG_INOTIFY)           += inotify.o
+obj-$(CONFIG_INOTIFY_USER)      += inotify_user.o
diff --git a/fs/inotify.c b/fs/notify/inotify/inotify.c
index dae3f28f30d4..dae3f28f30d4 100644
--- a/fs/inotify.c
+++ b/fs/notify/inotify/inotify.c
diff --git a/fs/inotify_user.c b/fs/notify/inotify/inotify_user.c
index e2425bbd871f..bed766e435b5 100644
--- a/fs/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -76,10 +76,10 @@ struct inotify_device {
        struct mutex            ev_mutex;       /* protects event queue */
        struct mutex            up_mutex;       /* synchronizes watch updates */
        struct list_head        events;         /* list of queued events */
-        atomic_t                count;          /* reference count */
        struct user_struct      *user;          /* user who opened this dev */
        struct inotify_handle   *ih;            /* inotify handle */
        struct fasync_struct    *fa;            /* async notification */
+        atomic_t                count;          /* reference count */
        unsigned int            queue_size;     /* size of the queue (bytes) */
        unsigned int            event_count;    /* number of pending events */
        unsigned int            max_events;     /* maximum number of events */
@@ -427,10 +427,61 @@ static unsigned int inotify_poll(struct file *file, poll_table *wait)
        return ret;
 }
+/*
+ * Get an inotify_kernel_event if one exists and is small
+ * enough to fit in "count". Return an error pointer if
+ * "count" is not large enough to hold the event.
+ *
+ * Called with the device ev_mutex held.
+ */
+static struct inotify_kernel_event *get_one_event(struct inotify_device *dev,
+                                                  size_t count)
+{
+        size_t event_size = sizeof(struct inotify_event);
+        struct inotify_kernel_event *kevent;
+        if (list_empty(&dev->events))
+                return NULL;
+        kevent = inotify_dev_get_event(dev);
+        if (kevent->name)
+                event_size += kevent->event.len;
+        if (event_size > count)
+                return ERR_PTR(-EINVAL);
+        remove_kevent(dev, kevent);
+        return kevent;
+}
+/*
+ * Copy an event to user space, returning how much we copied.
+ *
+ * We already checked in "get_one_event()" above that the event
+ * fits in the buffer we were given.
+ */
+static ssize_t copy_event_to_user(struct inotify_kernel_event *kevent,
+                                  char __user *buf)
+{
+        size_t event_size = sizeof(struct inotify_event);
+        if (copy_to_user(buf, &kevent->event, event_size))
+                return -EFAULT;
+        if (kevent->name) {
+                buf += event_size;
+                if (copy_to_user(buf, kevent->name, kevent->event.len))
+                        return -EFAULT;
+                event_size += kevent->event.len;
+        }
+        return event_size;
+}
 static ssize_t inotify_read(struct file *file, char __user *buf,
                            size_t count, loff_t *pos)
 {
-        size_t event_size = sizeof (struct inotify_event);
        struct inotify_device *dev;
        char __user *start;
        int ret;
@@ -440,81 +491,43 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
        dev = file->private_data;
        while (1) {
+                struct inotify_kernel_event *kevent;
                prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE);
                mutex_lock(&dev->ev_mutex);
-                if (!list_empty(&dev->events)) {
+                kevent = get_one_event(dev, count);
-                        ret = 0;
-                        break;
-                }
                mutex_unlock(&dev->ev_mutex);
-                if (file->f_flags & O_NONBLOCK) {
+                if (kevent) {
-                        ret = -EAGAIN;
+                        ret = PTR_ERR(kevent);
-                        break;
+                        if (IS_ERR(kevent))
-                }
+                                break;
+                        ret = copy_event_to_user(kevent, buf);
-                if (signal_pending(current)) {
+                        free_kevent(kevent);
-                        ret = -EINTR;
+                        if (ret < 0)
-                        break;
+                                break;
+                        buf += ret;
+                        count -= ret;
+                        continue;
                }
-                schedule();
+                ret = -EAGAIN;
-        }
+                if (file->f_flags & O_NONBLOCK)
-        finish_wait(&dev->wq, &wait);
-        if (ret)
-                return ret;
-        while (1) {
-                struct inotify_kernel_event *kevent;
-                ret = buf - start;
-                if (list_empty(&dev->events))
                        break;
+                ret = -EINTR;
-                kevent = inotify_dev_get_event(dev);
+                if (signal_pending(current))
-                if (event_size + kevent->event.len > count) {
-                        if (ret == 0 && count > 0) {
-                                /*
-                                 * could not get a single event because we
-                                 * didn't have enough buffer space.
-                                 */
-                                ret = -EINVAL;
-                        }
                        break;
-                }
-                remove_kevent(dev, kevent);
-                /*
+                if (start != buf)
-                 * Must perform the copy_to_user outside the mutex in order
-                 * to avoid a lock order reversal with mmap_sem.
-                 */
-                mutex_unlock(&dev->ev_mutex);
-                if (copy_to_user(buf, &kevent->event, event_size)) {
-                        ret = -EFAULT;
                        break;
-                }
-                buf += event_size;
-                count -= event_size;
-                if (kevent->name) {
-                        if (copy_to_user(buf, kevent->name, kevent->event.len)){
-                                ret = -EFAULT;
-                                break;
-                        }
-                        buf += kevent->event.len;
-                        count -= kevent->event.len;
-                }
-                free_kevent(kevent);
-                mutex_lock(&dev->ev_mutex);
+                schedule();
        }
-        mutex_unlock(&dev->ev_mutex);
+        finish_wait(&dev->wq, &wait);
+        if (start != buf && ret != -EFAULT)
+                ret = buf - start;
        return ret;
 }
@@ -576,7 +589,7 @@ static const struct inotify_operations inotify_user_ops = {
        .destroy_watch  = free_inotify_user_watch,
 };
-asmlinkage long sys_inotify_init1(int flags)
+SYSCALL_DEFINE1(inotify_init1, int, flags)
 {
        struct inotify_device *dev;
        struct inotify_handle *ih;
@@ -655,12 +668,13 @@ out_put_fd:
        return ret;
 }
-asmlinkage long sys_inotify_init(void)
+SYSCALL_DEFINE0(inotify_init)
 {
        return sys_inotify_init1(0);
 }
-asmlinkage long sys_inotify_add_watch(int fd, const char __user *pathname, u32 mask)
+SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
+                u32, mask)
 {
        struct inode *inode;
        struct inotify_device *dev;
@@ -704,7 +718,7 @@ fput_and_out:
        return ret;
 }
-asmlinkage long sys_inotify_rm_watch(int fd, u32 wd)
+SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd)
 {
        struct file *filp;
        struct inotify_device *dev;
diff --git a/fs/ntfs/Kconfig b/fs/ntfs/Kconfig
new file mode 100644
index 000000000000..f5a868cc9152
--- /dev/null
+++ b/fs/ntfs/Kconfig
@@ -0,0 +1,78 @@
+config NTFS_FS
+        tristate "NTFS file system support"
+        select NLS
+        help
+          NTFS is the file system of Microsoft Windows NT, 2000, XP and 2003.
+          Saying Y or M here enables read support.  There is partial, but
+          safe, write support available.  For write support you must also
+          say Y to "NTFS write support" below.
+          There are also a number of user-space tools available, called
+          ntfsprogs.  These include ntfsundelete and ntfsresize, that work
+          without NTFS support enabled in the kernel.
+          This is a rewrite from scratch of Linux NTFS support and replaced
+          the old NTFS code starting with Linux 2.5.11.  A backport to
+          the Linux 2.4 kernel series is separately available as a patch
+          from the project web site.
+          For more information see <file:Documentation/filesystems/ntfs.txt>
+          and <http://www.linux-ntfs.org/>.
+          To compile this file system support as a module, choose M here: the
+          module will be called ntfs.
+          If you are not using Windows NT, 2000, XP or 2003 in addition to
+          Linux on your computer it is safe to say N.
+config NTFS_DEBUG
+        bool "NTFS debugging support"
+        depends on NTFS_FS
+        help
+          If you are experiencing any problems with the NTFS file system, say
+          Y here.  This will result in additional consistency checks being
+          performed by the driver as well as additional debugging messages to
+          be written to the system log.  Note that debugging messages are
+          disabled by default.  To enable them, supply the option debug_msgs=1
+          at the kernel command line when booting the kernel or as an option
+          to insmod when loading the ntfs module.  Once the driver is active,
+          you can enable debugging messages by doing (as root):
+          echo 1 > /proc/sys/fs/ntfs-debug
+          Replacing the "1" with "0" would disable debug messages.
+          If you leave debugging messages disabled, this results in little
+          overhead, but enabling debug messages results in very significant
+          slowdown of the system.
+          When reporting bugs, please try to have available a full dump of
+          debugging messages while the misbehaviour was occurring.
+config NTFS_RW
+        bool "NTFS write support"
+        depends on NTFS_FS
+        help
+          This enables the partial, but safe, write support in the NTFS driver.
+          The only supported operation is overwriting existing files, without
+          changing the file length.  No file or directory creation, deletion or
+          renaming is possible.  Note only non-resident files can be written to
+          so you may find that some very small files (<500 bytes or so) cannot
+          be written to.
+          While we cannot guarantee that it will not damage any data, we have
+          so far not received a single report where the driver would have
+          damaged someone's data so we assume it is perfectly safe to use.
+          Note:  While write support is safe in this version (a rewrite from
+          scratch of the NTFS support), it should be noted that the old NTFS
+          write support, included in Linux 2.5.10 and before (since 1997),
+          is not safe.
+          This is currently useful with TopologiLinux.  TopologiLinux is run
+          on top of any DOS/Microsoft Windows system without partitioning your
+          hard disk.  Unlike other Linux distributions TopologiLinux does not
+          need its own partition.  For more information see
+          <http://topologi-linux.sourceforge.net/>
+          It is perfectly safe to say N here.
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index e9da092e2772..86bef156cf0a 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -1406,9 +1406,6 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
                ni->allocated_size = sle64_to_cpu(
                                a->data.non_resident.allocated_size);
        }
-        /* Setup the operations for this attribute inode. */
-        vi->i_op = NULL;
-        vi->i_fop = NULL;
        if (NInoMstProtected(ni))
                vi->i_mapping->a_ops = &ntfs_mst_aops;
        else
diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig
new file mode 100644
index 000000000000..701b7a3a872e
--- /dev/null
+++ b/fs/ocfs2/Kconfig
@@ -0,0 +1,85 @@
+config OCFS2_FS
+        tristate "OCFS2 file system support"
+        depends on NET && SYSFS
+        select CONFIGFS_FS
+        select JBD2
+        select CRC32
+        select QUOTA
+        select QUOTA_TREE
+        help
+          OCFS2 is a general purpose extent based shared disk cluster file
+          system with many similarities to ext3. It supports 64 bit inode
+          numbers, and has automatically extending metadata groups which may
+          also make it attractive for non-clustered use.
+          You'll want to install the ocfs2-tools package in order to at least
+          get "mount.ocfs2".
+          Project web page:    http://oss.oracle.com/projects/ocfs2
+          Tools web page:      http://oss.oracle.com/projects/ocfs2-tools
+          OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
+          For more information on OCFS2, see the file
+          <file:Documentation/filesystems/ocfs2.txt>.
+config OCFS2_FS_O2CB
+        tristate "O2CB Kernelspace Clustering"
+        depends on OCFS2_FS
+        default y
+        help
+          OCFS2 includes a simple kernelspace clustering package, the OCFS2
+          Cluster Base.  It only requires a very small userspace component
+          to configure it. This comes with the standard ocfs2-tools package.
+          O2CB is limited to maintaining a cluster for OCFS2 file systems.
+          It cannot manage any other cluster applications.
+          It is always safe to say Y here, as the clustering method is
+          run-time selectable.
+config OCFS2_FS_USERSPACE_CLUSTER
+        tristate "OCFS2 Userspace Clustering"
+        depends on OCFS2_FS && DLM
+        default y
+        help
+          This option will allow OCFS2 to use userspace clustering services
+          in conjunction with the DLM in fs/dlm.  If you are using a
+          userspace cluster manager, say Y here.
+          It is safe to say Y, as the clustering method is run-time
+          selectable.
+config OCFS2_FS_STATS
+        bool "OCFS2 statistics"
+        depends on OCFS2_FS
+        default y
+        help
+          This option allows some fs statistics to be captured. Enabling
+          this option may increase the memory consumption.
+config OCFS2_DEBUG_MASKLOG
+        bool "OCFS2 logging support"
+        depends on OCFS2_FS
+        default y
+        help
+          The ocfs2 filesystem has an extensive logging system.  The system
+          allows selection of events to log via files in /sys/o2cb/logmask/.
+          This option will enlarge your kernel, but it allows debugging of
+          ocfs2 filesystem issues.
+config OCFS2_DEBUG_FS
+        bool "OCFS2 expensive checks"
+        depends on OCFS2_FS
+        default n
+        help
+          This option will enable expensive consistency checks. Enable
+          this option for debugging only as it is likely to decrease
+          performance of the filesystem.
+config OCFS2_FS_POSIX_ACL
+        bool "OCFS2 POSIX Access Control Lists"
+        depends on OCFS2_FS
+        select FS_POSIX_ACL
+        default n
+        help
+          Posix Access Control Lists (ACLs) support permissions for users and
+          groups beyond the owner/group/world scheme.
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 589dcdfdfe3c..01596079dd63 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_OCFS2_FS_USERSPACE_CLUSTER) += ocfs2_stack_user.o
 ocfs2-objs := \
        alloc.o                 \
        aops.o                  \
+        blockcheck.o            \
        buffer_head_io.o        \
        dcache.o                \
        dir.o                   \
@@ -35,8 +36,14 @@ ocfs2-objs := \
        sysfile.o               \
        uptodate.o              \
        ver.o                   \
+        quota_local.o           \
+        quota_global.o          \
        xattr.o
+ifeq ($(CONFIG_OCFS2_FS_POSIX_ACL),y)
+ocfs2-objs += acl.o
+endif
 ocfs2_stackglue-objs := stackglue.o
 ocfs2_stack_o2cb-objs := stack_o2cb.o
 ocfs2_stack_user-objs := stack_user.o
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
new file mode 100644
index 000000000000..12dfb44c22e5
--- /dev/null
+++ b/fs/ocfs2/acl.c
@@ -0,0 +1,479 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * acl.c
+ *
+ * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
+ *
+ * CREDITS:
+ * Lots of code in this file is copied from linux/fs/ext3/acl.c.
+ * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#define MLOG_MASK_PREFIX ML_INODE
+#include <cluster/masklog.h>
+#include "ocfs2.h"
+#include "alloc.h"
+#include "dlmglue.h"
+#include "file.h"
+#include "ocfs2_fs.h"
+#include "xattr.h"
+#include "acl.h"
+/*
+ * Convert from xattr value to acl struct.
+ *
+ * @value: xattr payload, read as an array of little-endian
+ *         struct ocfs2_acl_entry records
+ * @size:  length of @value in bytes
+ *
+ * Returns NULL when @value is NULL, ERR_PTR(-EINVAL) when @size cannot hold
+ * even one entry, ERR_PTR(-ENOMEM) when the ACL cannot be allocated, and the
+ * newly allocated ACL otherwise.  Trailing bytes that do not form a whole
+ * entry are ignored.
+ */
+static struct posix_acl *ocfs2_acl_from_xattr(const void *value, size_t size)
+{
+        /* The payload is in the on-disk layout, so walk it with that type. */
+        const struct ocfs2_acl_entry *entry = value;
+        struct posix_acl *acl;
+        int n, count;
+        if (!value)
+                return NULL;
+        if (size < sizeof(struct ocfs2_acl_entry))
+                return ERR_PTR(-EINVAL);
+        /* size covers at least one entry here, so count cannot be zero. */
+        count = size / sizeof(struct ocfs2_acl_entry);
+        if (count < 0)
+                return ERR_PTR(-EINVAL);
+        acl = posix_acl_alloc(count, GFP_NOFS);
+        if (!acl)
+                return ERR_PTR(-ENOMEM);
+        for (n = 0; n < count; n++, entry++) {
+                acl->a_entries[n].e_tag  = le16_to_cpu(entry->e_tag);
+                acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
+                acl->a_entries[n].e_id   = le32_to_cpu(entry->e_id);
+        }
+        return acl;
+}
+/*
+ * Convert acl struct to xattr value.
+ *
+ * @acl:  in-memory ACL to serialize
+ * @size: out parameter, set to the byte length of the returned buffer
+ *
+ * Returns a kmalloc()ed array of little-endian struct ocfs2_acl_entry, one
+ * per ACL entry, or ERR_PTR(-ENOMEM).  Note that *size is written before the
+ * allocation is attempted.  The caller releases the buffer with kfree().
+ */
+static void *ocfs2_acl_to_xattr(const struct posix_acl *acl, size_t *size)
+{
+        struct ocfs2_acl_entry *disk_acl;
+        size_t n;
+        /* Size the buffer by the on-disk entry type that is stored in it. */
+        *size = acl->a_count * sizeof(struct ocfs2_acl_entry);
+        disk_acl = kmalloc(*size, GFP_NOFS);
+        if (!disk_acl)
+                return ERR_PTR(-ENOMEM);
+        for (n = 0; n < acl->a_count; n++) {
+                disk_acl[n].e_tag  = cpu_to_le16(acl->a_entries[n].e_tag);
+                disk_acl[n].e_perm = cpu_to_le16(acl->a_entries[n].e_perm);
+                disk_acl[n].e_id   = cpu_to_le32(acl->a_entries[n].e_id);
+        }
+        return disk_acl;
+}
+/*
+ * Read one ACL of @inode without taking the inode lock here.
+ *
+ * Returns NULL when POSIX ACLs are not enabled by the mount options or when
+ * no ACL is stored, ERR_PTR(-EINVAL) for an unknown @type, another ERR_PTR()
+ * on lookup or allocation failure, and the decoded ACL otherwise.
+ */
+static struct posix_acl *ocfs2_get_acl_nolock(struct inode *inode,
+                                              int type,
+                                              struct buffer_head *di_bh)
+{
+        struct posix_acl *acl = NULL;
+        char *buf = NULL;
+        int xattr_index;
+        int len;
+        if (!(OCFS2_SB(inode->i_sb)->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+                return NULL;
+        if (type == ACL_TYPE_ACCESS)
+                xattr_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS;
+        else if (type == ACL_TYPE_DEFAULT)
+                xattr_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT;
+        else
+                return ERR_PTR(-EINVAL);
+        /*
+         * First call passes no buffer; a positive result is used as the
+         * size of the buffer handed to the second call.
+         */
+        len = ocfs2_xattr_get_nolock(inode, di_bh, xattr_index, "", NULL, 0);
+        if (len > 0) {
+                buf = kmalloc(len, GFP_NOFS);
+                if (!buf)
+                        return ERR_PTR(-ENOMEM);
+                len = ocfs2_xattr_get_nolock(inode, di_bh, xattr_index,
+                                             "", buf, len);
+        }
+        if (len > 0)
+                acl = ocfs2_acl_from_xattr(buf, len);
+        else if (len != 0 && len != -ENODATA)
+                acl = ERR_PTR(len);
+        kfree(buf);
+        return acl;
+}
+/*
+ * Get posix acl.
+ *
+ * Takes the inode lock around ocfs2_get_acl_nolock() and drops it again
+ * before returning.  Returns NULL when POSIX ACLs are not enabled by the
+ * mount options, or ERR_PTR() when the lock cannot be taken.
+ */
+static struct posix_acl *ocfs2_get_acl(struct inode *inode, int type)
+{
+        struct buffer_head *di_bh = NULL;
+        struct posix_acl *result;
+        int status;
+        if (!(OCFS2_SB(inode->i_sb)->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+                return NULL;
+        status = ocfs2_inode_lock(inode, &di_bh, 0);
+        if (status < 0) {
+                mlog_errno(status);
+                return ERR_PTR(status);
+        }
+        result = ocfs2_get_acl_nolock(inode, type, di_bh);
+        ocfs2_inode_unlock(inode, 0);
+        brelse(di_bh);
+        return result;
+}
+/*
+ * Set the access or default ACL of an inode.
+ *
+ * @handle:  journal handle; when non-NULL the xattr is written through
+ *           ocfs2_xattr_set_handle() using @di_bh, @meta_ac and @data_ac,
+ *           otherwise ocfs2_xattr_set() is used and those three are unused.
+ * @type:    ACL_TYPE_ACCESS or ACL_TYPE_DEFAULT; anything else is -EINVAL.
+ * @acl:     ACL to store, or NULL, in which case a NULL value of size 0 is
+ *           handed to the xattr layer (presumably removing the attribute;
+ *           confirm against the xattr code).
+ *
+ * Symlinks are rejected with -EOPNOTSUPP.  A default ACL on a non-directory
+ * yields -EACCES, while a NULL default ACL on a non-directory returns 0
+ * without touching the xattr.
+ *
+ * NOTE(review): for ACL_TYPE_ACCESS, inode->i_mode is updated from
+ * posix_acl_equiv_mode() before the xattr write and is not rolled back in
+ * this function if that write fails.
+ */
+static int ocfs2_set_acl(handle_t *handle,
+                         struct inode *inode,
+                         struct buffer_head *di_bh,
+                         int type,
+                         struct posix_acl *acl,
+                         struct ocfs2_alloc_context *meta_ac,
+                         struct ocfs2_alloc_context *data_ac)
+{
+        int name_index;
+        void *value = NULL;
+        size_t size = 0;
+        int ret;
+        if (S_ISLNK(inode->i_mode))
+                return -EOPNOTSUPP;
+        switch (type) {
+        case ACL_TYPE_ACCESS:
+                name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS;
+                if (acl) {
+                        mode_t mode = inode->i_mode;
+                        ret = posix_acl_equiv_mode(acl, &mode);
+                        if (ret < 0)
+                                return ret;
+                        else {
+                                inode->i_mode = mode;
+                                if (ret == 0) /* presumably: mode bits alone express the ACL */
+                                        acl = NULL;
+                        }
+                }
+                break;
+        case ACL_TYPE_DEFAULT:
+                name_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT;
+                if (!S_ISDIR(inode->i_mode))
+                        return acl ? -EACCES : 0;
+                break;
+        default:
+                return -EINVAL;
+        }
+        if (acl) {
+                value = ocfs2_acl_to_xattr(acl, &size); /* freed below with kfree() */
+                if (IS_ERR(value))
+                        return (int)PTR_ERR(value);
+        }
+        if (handle)
+                ret = ocfs2_xattr_set_handle(handle, inode, di_bh, name_index,
+                                             "", value, size, 0,
+                                             meta_ac, data_ac);
+        else
+                ret = ocfs2_xattr_set(inode, name_index, "", value, size, 0);
+        kfree(value);
+        return ret;
+}
+/*
+ * Check @mask against the access ACL of @inode.
+ *
+ * Returns the posix_acl_permission() result when an access ACL exists,
+ * -EAGAIN when there is none, or the error from reading the ACL.
+ */
+int ocfs2_check_acl(struct inode *inode, int mask)
+{
+        struct posix_acl *acl;
+        int ret;
+        acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS);
+        if (IS_ERR(acl))
+                return PTR_ERR(acl);
+        if (!acl)
+                return -EAGAIN;
+        ret = posix_acl_permission(inode, acl, mask);
+        posix_acl_release(acl);
+        return ret;
+}
+/*
+ * Bring the access ACL of @inode in line with its current i_mode.
+ *
+ * Returns -EOPNOTSUPP for symlinks, 0 when POSIX ACLs are not enabled by
+ * the mount options or the inode has no access ACL, and otherwise the
+ * result of masking a clone of the ACL and writing it back.
+ */
+int ocfs2_acl_chmod(struct inode *inode)
+{
+        struct posix_acl *acl;
+        struct posix_acl *clone;
+        int ret;
+        if (S_ISLNK(inode->i_mode))
+                return -EOPNOTSUPP;
+        if (!(OCFS2_SB(inode->i_sb)->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+                return 0;
+        acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS);
+        if (IS_ERR(acl))
+                return PTR_ERR(acl);
+        if (!acl)
+                return 0;
+        clone = posix_acl_clone(acl, GFP_KERNEL);
+        posix_acl_release(acl);
+        if (!clone)
+                return -ENOMEM;
+        ret = posix_acl_chmod_masq(clone, inode->i_mode);
+        if (ret == 0)
+                ret = ocfs2_set_acl(NULL, inode, NULL, ACL_TYPE_ACCESS,
+                                    clone, NULL, NULL);
+        posix_acl_release(clone);
+        return ret;
+}
+/*
+ * Initialize the ACLs of a new inode. If parent directory has default ACL,
+ * then clone to new inode. Called from ocfs2_mknod.
+ *
+ * For non-symlinks, the parent's default ACL is read with
+ * ocfs2_get_acl_nolock() using @dir_bh, but only when the POSIX ACL mount
+ * option is set.  When no default ACL was obtained, the current task's
+ * umask is applied to inode->i_mode instead.
+ *
+ * When a default ACL exists: a new directory first receives it as its own
+ * default ACL; then a clone is passed through posix_acl_create_masq(), the
+ * resulting mode is stored in inode->i_mode, and the clone is written as
+ * the access ACL only if posix_acl_create_masq() returned a positive value.
+ *
+ * Returns 0, or the error propagated from the ACL read, the clone
+ * allocation (-ENOMEM), posix_acl_create_masq() or ocfs2_set_acl().
+ */
+int ocfs2_init_acl(handle_t *handle,
+                   struct inode *inode,
+                   struct inode *dir,
+                   struct buffer_head *di_bh,
+                   struct buffer_head *dir_bh,
+                   struct ocfs2_alloc_context *meta_ac,
+                   struct ocfs2_alloc_context *data_ac)
+{
+        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        struct posix_acl *acl = NULL;
+        int ret = 0;
+        if (!S_ISLNK(inode->i_mode)) {
+                if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
+                        acl = ocfs2_get_acl_nolock(dir, ACL_TYPE_DEFAULT,
+                                                   dir_bh);
+                        if (IS_ERR(acl))
+                                return PTR_ERR(acl);
+                }
+                if (!acl)
+                        inode->i_mode &= ~current->fs->umask;
+        }
+        if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) {
+                struct posix_acl *clone;
+                mode_t mode;
+                if (S_ISDIR(inode->i_mode)) {
+                        ret = ocfs2_set_acl(handle, inode, di_bh,
+                                            ACL_TYPE_DEFAULT, acl,
+                                            meta_ac, data_ac);
+                        if (ret)
+                                goto cleanup;
+                }
+                clone = posix_acl_clone(acl, GFP_NOFS);
+                ret = -ENOMEM; /* returned if the clone could not be allocated */
+                if (!clone)
+                        goto cleanup;
+                mode = inode->i_mode;
+                ret = posix_acl_create_masq(clone, &mode);
+                if (ret >= 0) {
+                        inode->i_mode = mode;
+                        if (ret > 0) {
+                                ret = ocfs2_set_acl(handle, inode,
+                                                    di_bh, ACL_TYPE_ACCESS,
+                                                    clone, meta_ac, data_ac);
+                        }
+                }
+                posix_acl_release(clone);
+        }
+cleanup:
+        posix_acl_release(acl); /* acl may be NULL on this path */
+        return ret;
+}
+/*
+ * ->list handler for the access ACL xattr.
+ *
+ * Returns 0 when POSIX ACLs are not enabled by the mount options.  Otherwise
+ * returns sizeof(POSIX_ACL_XATTR_ACCESS) and, when @list is non-NULL and
+ * @list_len is large enough, copies that many bytes of the name into @list.
+ */
+static size_t ocfs2_xattr_list_acl_access(struct inode *inode,
+                                          char *list,
+                                          size_t list_len,
+                                          const char *name,
+                                          size_t name_len)
+{
+        const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
+        if (!(OCFS2_SB(inode->i_sb)->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+                return 0;
+        if (list != NULL && list_len >= size)
+                memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
+        return size;
+}
+/*
+ * ->list handler for the default ACL xattr.
+ *
+ * Returns 0 when POSIX ACLs are not enabled by the mount options.  Otherwise
+ * returns sizeof(POSIX_ACL_XATTR_DEFAULT) and, when @list is non-NULL and
+ * @list_len is large enough, copies that many bytes of the name into @list.
+ */
+static size_t ocfs2_xattr_list_acl_default(struct inode *inode,
+                                           char *list,
+                                           size_t list_len,
+                                           const char *name,
+                                           size_t name_len)
+{
+        const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
+        if (!(OCFS2_SB(inode->i_sb)->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+                return 0;
+        if (list != NULL && list_len >= size)
+                memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
+        return size;
+}
+/*
+ * Common ->get path for both ACL xattrs.
+ *
+ * Returns -EOPNOTSUPP when POSIX ACLs are not enabled by the mount options,
+ * -ENODATA when the inode has no ACL of @type, the error from reading the
+ * ACL, or otherwise the result of posix_acl_to_xattr() on @buffer/@size.
+ */
+static int ocfs2_xattr_get_acl(struct inode *inode,
+                               int type,
+                               void *buffer,
+                               size_t size)
+{
+        struct posix_acl *acl;
+        int len;
+        if (!(OCFS2_SB(inode->i_sb)->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+                return -EOPNOTSUPP;
+        acl = ocfs2_get_acl(inode, type);
+        if (IS_ERR(acl))
+                return PTR_ERR(acl);
+        if (!acl)
+                return -ENODATA;
+        len = posix_acl_to_xattr(acl, buffer, size);
+        posix_acl_release(acl);
+        return len;
+}
+/* ->get handler for the access ACL; any non-empty @name is -EINVAL. */
+static int ocfs2_xattr_get_acl_access(struct inode *inode,
+                                      const char *name,
+                                      void *buffer,
+                                      size_t size)
+{
+        if (strcmp(name, ""))
+                return -EINVAL;
+        return ocfs2_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size);
+}
+/* ->get handler for the default ACL; any non-empty @name is -EINVAL. */
+static int ocfs2_xattr_get_acl_default(struct inode *inode,
+                                       const char *name,
+                                       void *buffer,
+                                       size_t size)
+{
+        if (strcmp(name, ""))
+                return -EINVAL;
+        return ocfs2_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size);
+}
+/*
+ * Common ->set path for both ACL xattrs.
+ *
+ * Returns -EOPNOTSUPP when POSIX ACLs are not enabled by the mount options
+ * and -EPERM when is_owner_or_cap() fails.  A non-NULL @value is decoded
+ * with posix_acl_from_xattr() and checked with posix_acl_valid() before it
+ * is stored; a NULL @value stores a NULL ACL.
+ */
+static int ocfs2_xattr_set_acl(struct inode *inode,
+                               int type,
+                               const void *value,
+                               size_t size)
+{
+        struct posix_acl *acl = NULL;
+        int ret = 0;
+        if (!(OCFS2_SB(inode->i_sb)->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+                return -EOPNOTSUPP;
+        if (!is_owner_or_cap(inode))
+                return -EPERM;
+        if (value) {
+                acl = posix_acl_from_xattr(value, size);
+                if (IS_ERR(acl))
+                        return PTR_ERR(acl);
+        }
+        /* Only a decoded, non-NULL ACL needs validating. */
+        if (acl)
+                ret = posix_acl_valid(acl);
+        if (!ret)
+                ret = ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL);
+        posix_acl_release(acl);
+        return ret;
+}
+/*
+ * ->set handler for the access ACL; any non-empty @name is -EINVAL.
+ * @flags is not used.
+ */
+static int ocfs2_xattr_set_acl_access(struct inode *inode,
+                                      const char *name,
+                                      const void *value,
+                                      size_t size,
+                                      int flags)
+{
+        if (strcmp(name, ""))
+                return -EINVAL;
+        return ocfs2_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
+}
+/*
+ * ->set handler for the default ACL; any non-empty @name is -EINVAL.
+ * @flags is not used.
+ */
+static int ocfs2_xattr_set_acl_default(struct inode *inode,
+                                       const char *name,
+                                       const void *value,
+                                       size_t size,
+                                       int flags)
+{
+        if (strcmp(name, ""))
+                return -EINVAL;
+        return ocfs2_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
+}
+/* xattr handler table entry for the POSIX_ACL_XATTR_ACCESS name. */
+struct xattr_handler ocfs2_xattr_acl_access_handler = {
+        .prefix = POSIX_ACL_XATTR_ACCESS,
+        .get = ocfs2_xattr_get_acl_access,
+        .set = ocfs2_xattr_set_acl_access,
+        .list = ocfs2_xattr_list_acl_access,
+};
+/* xattr handler table entry for the POSIX_ACL_XATTR_DEFAULT name. */
+struct xattr_handler ocfs2_xattr_acl_default_handler = {
+        .prefix = POSIX_ACL_XATTR_DEFAULT,
+        .get = ocfs2_xattr_get_acl_default,
+        .set = ocfs2_xattr_set_acl_default,
+        .list = ocfs2_xattr_list_acl_default,
+};
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
new file mode 100644
index 000000000000..8f6389ed4da5
--- /dev/null
+++ b/fs/ocfs2/acl.h
@@ -0,0 +1,58 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * acl.h
+ *
+ * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#ifndef OCFS2_ACL_H
+#define OCFS2_ACL_H
+#include <linux/posix_acl_xattr.h>
+struct ocfs2_acl_entry {        /* one ACL entry, little-endian fields */
+        __le16 e_tag;           /* converted to/from posix_acl_entry e_tag */
+        __le16 e_perm;          /* converted to/from posix_acl_entry e_perm */
+        __le32 e_id;            /* converted to/from posix_acl_entry e_id */
+};
+#ifdef CONFIG_OCFS2_FS_POSIX_ACL
+extern int ocfs2_check_acl(struct inode *, int);
+extern int ocfs2_acl_chmod(struct inode *);
+extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
+                          struct buffer_head *, struct buffer_head *,
+                          struct ocfs2_alloc_context *,
+                          struct ocfs2_alloc_context *);
+#else /* CONFIG_OCFS2_FS_POSIX_ACL*/
+#define ocfs2_check_acl NULL
+/* Stub used when CONFIG_OCFS2_FS_POSIX_ACL is not set: always succeeds. */
+static inline int ocfs2_acl_chmod(struct inode *inode)
+{
+        return 0;
+}
+/*
+ * Stub used when CONFIG_OCFS2_FS_POSIX_ACL is not set: does nothing with its
+ * arguments and always returns 0.
+ */
+static inline int ocfs2_init_acl(handle_t *handle,
+                                 struct inode *inode,
+                                 struct inode *dir,
+                                 struct buffer_head *di_bh,
+                                 struct buffer_head *dir_bh,
+                                 struct ocfs2_alloc_context *meta_ac,
+                                 struct ocfs2_alloc_context *data_ac)
+{
+        return 0;
+}
+#endif /* CONFIG_OCFS2_FS_POSIX_ACL*/
+#endif /* OCFS2_ACL_H */
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 0cc2deb9394c..60fe74035db5 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -28,6 +28,7 @@
 #include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/swap.h>
+#include <linux/quotaops.h>
 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
 #include <cluster/masklog.h>
@@ -36,6 +37,7 @@
 #include "alloc.h"
 #include "aops.h"
+#include "blockcheck.h"
 #include "dlmglue.h"
 #include "extent_map.h"
 #include "inode.h"
@@ -46,6 +48,7 @@
 #include "file.h"
 #include "super.h"
 #include "uptodate.h"
+#include "xattr.h"
 #include "buffer_head_io.h"
@@ -187,20 +190,12 @@ static int ocfs2_dinode_insert_check(struct inode *inode,
 static int ocfs2_dinode_sanity_check(struct inode *inode,
                                     struct ocfs2_extent_tree *et)
 {
-        int ret = 0;
+        struct ocfs2_dinode *di = et->et_object;
-        struct ocfs2_dinode *di;
        BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
+        BUG_ON(!OCFS2_IS_VALID_DINODE(di));
-        di = et->et_object;
+        return 0;
-        if (!OCFS2_IS_VALID_DINODE(di)) {
-                ret = -EIO;
-                ocfs2_error(inode->i_sb,
-                        "Inode %llu has invalid path root",
-                        (unsigned long long)OCFS2_I(inode)->ip_blkno);
-        }
-        return ret;
 }
 static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
@@ -213,36 +208,33 @@ static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
 static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et)
 {
-        struct ocfs2_xattr_value_root *xv = et->et_object;
+        struct ocfs2_xattr_value_buf *vb = et->et_object;
-        et->et_root_el = &xv->xr_list;
+        et->et_root_el = &vb->vb_xv->xr_list;
 }
 static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
                                              u64 blkno)
 {
-        struct ocfs2_xattr_value_root *xv =
+        struct ocfs2_xattr_value_buf *vb = et->et_object;
-                (struct ocfs2_xattr_value_root *)et->et_object;
-        xv->xr_last_eb_blk = cpu_to_le64(blkno);
+        vb->vb_xv->xr_last_eb_blk = cpu_to_le64(blkno);
 }
 static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
 {
-        struct ocfs2_xattr_value_root *xv =
+        struct ocfs2_xattr_value_buf *vb = et->et_object;
-                (struct ocfs2_xattr_value_root *) et->et_object;
-        return le64_to_cpu(xv->xr_last_eb_blk);
+        return le64_to_cpu(vb->vb_xv->xr_last_eb_blk);
 }
 static void ocfs2_xattr_value_update_clusters(struct inode *inode,
                                              struct ocfs2_extent_tree *et,
                                              u32 clusters)
 {
-        struct ocfs2_xattr_value_root *xv =
+        struct ocfs2_xattr_value_buf *vb = et->et_object;
-                (struct ocfs2_xattr_value_root *)et->et_object;
-        le32_add_cpu(&xv->xr_clusters, clusters);
+        le32_add_cpu(&vb->vb_xv->xr_clusters, clusters);
 }
 static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
@@ -304,11 +296,13 @@ static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
 static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
                                     struct inode *inode,
                                     struct buffer_head *bh,
+                                     ocfs2_journal_access_func access,
                                     void *obj,
                                     struct ocfs2_extent_tree_operations *ops)
 {
        et->et_ops = ops;
        et->et_root_bh = bh;
+        et->et_root_journal_access = access;
        if (!obj)
                obj = (void *)bh->b_data;
        et->et_object = obj;
@@ -324,23 +318,23 @@ void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
                                   struct inode *inode,
                                   struct buffer_head *bh)
 {
-        __ocfs2_init_extent_tree(et, inode, bh, NULL, &ocfs2_dinode_et_ops);
+        __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_di,
+                                 NULL, &ocfs2_dinode_et_ops);
 }
 void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
                                       struct inode *inode,
                                       struct buffer_head *bh)
 {
-        __ocfs2_init_extent_tree(et, inode, bh, NULL,
+        __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_xb,
-                                 &ocfs2_xattr_tree_et_ops);
+                                 NULL, &ocfs2_xattr_tree_et_ops);
 }
 void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
                                        struct inode *inode,
-                                        struct buffer_head *bh,
+                                        struct ocfs2_xattr_value_buf *vb)
-                                        struct ocfs2_xattr_value_root *xv)
 {
-        __ocfs2_init_extent_tree(et, inode, bh, xv,
+        __ocfs2_init_extent_tree(et, inode, vb->vb_bh, vb->vb_access, vb,
                                 &ocfs2_xattr_value_et_ops);
 }
@@ -362,6 +356,15 @@ static inline void ocfs2_et_update_clusters(struct inode *inode,
        et->et_ops->eo_update_clusters(inode, et, clusters);
 }
+/*
+ * Journal the root buffer of @et using the access function recorded in the
+ * extent tree, passing @type through unchanged.
+ */
+static inline int ocfs2_et_root_journal_access(handle_t *handle,
+                                               struct inode *inode,
+                                               struct ocfs2_extent_tree *et,
+                                               int type)
+{
+        ocfs2_journal_access_func access = et->et_root_journal_access;
+        return access(handle, inode, et->et_root_bh, type);
+}
 static inline int ocfs2_et_insert_check(struct inode *inode,
                                        struct ocfs2_extent_tree *et,
                                        struct ocfs2_extent_rec *rec)
@@ -402,12 +405,14 @@ struct ocfs2_path_item {
 #define OCFS2_MAX_PATH_DEPTH    5
 struct ocfs2_path {
-        int                     p_tree_depth;
+        int                             p_tree_depth;
-        struct ocfs2_path_item  p_node[OCFS2_MAX_PATH_DEPTH];
+        ocfs2_journal_access_func       p_root_access;
+        struct ocfs2_path_item          p_node[OCFS2_MAX_PATH_DEPTH];
 };
 #define path_root_bh(_path) ((_path)->p_node[0].bh)
 #define path_root_el(_path) ((_path)->p_node[0].el)
+#define path_root_access(_path)((_path)->p_root_access)
 #define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
 #define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
 #define path_num_items(_path) ((_path)->p_tree_depth + 1)
@@ -440,6 +445,8 @@ static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
         */
        if (keep_root)
                depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
+        else
+                path_root_access(path) = NULL;
        path->p_tree_depth = depth;
 }
@@ -465,6 +472,7 @@ static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
        BUG_ON(path_root_bh(dest) != path_root_bh(src));
        BUG_ON(path_root_el(dest) != path_root_el(src));
+        BUG_ON(path_root_access(dest) != path_root_access(src));
        ocfs2_reinit_path(dest, 1);
@@ -486,6 +494,7 @@ static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
        int i;
        BUG_ON(path_root_bh(dest) != path_root_bh(src));
+        BUG_ON(path_root_access(dest) != path_root_access(src));
        for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
                brelse(dest->p_node[i].bh);
@@ -521,7 +530,8 @@ static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
 }
 static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
-                                         struct ocfs2_extent_list *root_el)
+                                         struct ocfs2_extent_list *root_el,
+                                         ocfs2_journal_access_func access)
 {
        struct ocfs2_path *path;
@@ -533,11 +543,48 @@ static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
                get_bh(root_bh);
                path_root_bh(path) = root_bh;
                path_root_el(path) = root_el;
+                path_root_access(path) = access;
        }
        return path;
 }
+/*
+ * Allocate a new path that shares the root buffer, root extent list and
+ * root journal access function of @path.
+ */
+static struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path)
+{
+        struct buffer_head *root_bh = path_root_bh(path);
+        struct ocfs2_extent_list *root_el = path_root_el(path);
+        return ocfs2_new_path(root_bh, root_el, path_root_access(path));
+}
+/*
+ * Allocate a new path rooted at the root buffer, root extent list and root
+ * journal access function recorded in @et.
+ */
+static struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et)
+{
+        ocfs2_journal_access_func access = et->et_root_journal_access;
+        return ocfs2_new_path(et->et_root_bh, et->et_root_el, access);
+}
+/*
+ * Journal, for write, the buffer sitting at depth @idx of @path.
+ *
+ * Every idx > 0 is journaled with ocfs2_journal_access_eb().  The root
+ * (idx == 0) uses the path's root access function, falling back to
+ * ocfs2_journal_access() when none is recorded.
+ *
+ * The name reads awkwardly next to ocfs2_journal_access_path(), but no
+ * better one has been found.
+ */
+static int ocfs2_path_bh_journal_access(handle_t *handle,
+                                        struct inode *inode,
+                                        struct ocfs2_path *path,
+                                        int idx)
+{
+        ocfs2_journal_access_func access;
+        if (idx != 0)
+                access = ocfs2_journal_access_eb;
+        else if (path_root_access(path))
+                access = path_root_access(path);
+        else
+                access = ocfs2_journal_access;
+        return access(handle, inode, path->p_node[idx].bh,
+                      OCFS2_JOURNAL_ACCESS_WRITE);
+}
 /*
 * Convenience function to journal all components in a path.
 */
@@ -550,8 +597,7 @@ static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
                goto out;
        for(i = 0; i < path_num_items(path); i++) {
-                ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh,
+                ret = ocfs2_path_bh_journal_access(handle, inode, path, i);
-                                           OCFS2_JOURNAL_ACCESS_WRITE);
                if (ret < 0) {
                        mlog_errno(ret);
                        goto out;
@@ -686,6 +732,80 @@ struct ocfs2_merge_ctxt {
        int                     c_split_covers_rec;
 };
+/*
+ * Validate a freshly read extent block; used as the validation callback
+ * passed to ocfs2_read_block() by ocfs2_read_extent_block().
+ *
+ * Returns 0 when the block passes, the ocfs2_validate_meta_ecc() error when
+ * the checksum fails, and -EINVAL (after ocfs2_error()) when the signature,
+ * h_blkno or h_fs_generation does not match.
+ */
+static int ocfs2_validate_extent_block(struct super_block *sb,
+                                       struct buffer_head *bh)
+{
+        struct ocfs2_extent_block *eb;
+        unsigned long long blkno = (unsigned long long)bh->b_blocknr;
+        int status;
+        eb = (struct ocfs2_extent_block *)bh->b_data;
+        mlog(0, "Validating extent block %llu\n", blkno);
+        BUG_ON(!buffer_uptodate(bh));
+        /*
+         * A checksum failure is reported to the caller, but the filesystem
+         * is otherwise left running: the damage is known to be local to
+         * this one block.
+         */
+        status = ocfs2_validate_meta_ecc(sb, bh->b_data, &eb->h_check);
+        if (status) {
+                mlog(ML_ERROR, "Checksum failed for extent block %llu\n",
+                     blkno);
+                return status;
+        }
+        /*
+         * Errors after here are fatal.
+         */
+        if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+                ocfs2_error(sb,
+                            "Extent block #%llu has bad signature %.*s",
+                            blkno, 7, eb->h_signature);
+                return -EINVAL;
+        }
+        if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) {
+                ocfs2_error(sb,
+                            "Extent block #%llu has an invalid h_blkno "
+                            "of %llu",
+                            blkno,
+                            (unsigned long long)le64_to_cpu(eb->h_blkno));
+                return -EINVAL;
+        }
+        if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) {
+                ocfs2_error(sb,
+                            "Extent block #%llu has an invalid "
+                            "h_fs_generation of #%u",
+                            blkno,
+                            le32_to_cpu(eb->h_fs_generation));
+                return -EINVAL;
+        }
+        return 0;
+}
+/*
+ * Read extent block @eb_blkno through ocfs2_read_block(), validating it
+ * with ocfs2_validate_extent_block().
+ *
+ * On success, when the caller passed *bh == NULL, *bh is set to the buffer
+ * obtained by the read.  On failure *bh is left as the caller supplied it.
+ */
+int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno,
+                            struct buffer_head **bh)
+{
+        struct buffer_head *found = *bh;
+        int status;
+        status = ocfs2_read_block(inode, eb_blkno, &found,
+                                  ocfs2_validate_extent_block);
+        if (status)
+                return status;
+        /* Hand a newly obtained buffer head back to the caller. */
+        if (*bh == NULL)
+                *bh = found;
+        return 0;
+}
 /*
 * How many free extents have we got before we need more meta data?
 */
@@ -705,8 +825,7 @@ int ocfs2_num_free_extents(struct ocfs2_super *osb,
        last_eb_blk = ocfs2_et_get_last_eb_blk(et);
        if (last_eb_blk) {
-                retval = ocfs2_read_block(inode, last_eb_blk,
+                retval = ocfs2_read_extent_block(inode, last_eb_blk, &eb_bh);
-                                          &eb_bh);
                if (retval < 0) {
                        mlog_errno(retval);
                        goto bail;
@@ -768,8 +887,8 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
                        }
                        ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
-                        status = ocfs2_journal_access(handle, inode, bhs[i],
+                        status = ocfs2_journal_access_eb(handle, inode, bhs[i],
-                                                      OCFS2_JOURNAL_ACCESS_CREATE);
+                                                         OCFS2_JOURNAL_ACCESS_CREATE);
                        if (status < 0) {
                                mlog_errno(status);
                                goto bail;
@@ -908,15 +1027,12 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
        for(i = 0; i < new_blocks; i++) {
                bh = new_eb_bhs[i];
                eb = (struct ocfs2_extent_block *) bh->b_data;
-                if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+                /* ocfs2_create_new_meta_bhs() should create it right! */
-                        OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+                BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
-                        status = -EIO;
-                        goto bail;
-                }
                eb_el = &eb->h_list;
-                status = ocfs2_journal_access(handle, inode, bh,
+                status = ocfs2_journal_access_eb(handle, inode, bh,
-                                              OCFS2_JOURNAL_ACCESS_CREATE);
+                                                 OCFS2_JOURNAL_ACCESS_CREATE);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail;
@@ -955,21 +1071,21 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
         * journal_dirty erroring as it won't unless we've aborted the
         * handle (in which case we would never be here) so reserving
         * the write with journal_access is all we need to do. */
-        status = ocfs2_journal_access(handle, inode, *last_eb_bh,
+        status = ocfs2_journal_access_eb(handle, inode, *last_eb_bh,
-                                      OCFS2_JOURNAL_ACCESS_WRITE);
+                                         OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }
-        status = ocfs2_journal_access(handle, inode, et->et_root_bh,
+        status = ocfs2_et_root_journal_access(handle, inode, et,
-                                      OCFS2_JOURNAL_ACCESS_WRITE);
+                                              OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }
        if (eb_bh) {
-                status = ocfs2_journal_access(handle, inode, eb_bh,
+                status = ocfs2_journal_access_eb(handle, inode, eb_bh,
-                                              OCFS2_JOURNAL_ACCESS_WRITE);
+                                                 OCFS2_JOURNAL_ACCESS_WRITE);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail;
@@ -1052,17 +1168,14 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
        }
        eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
-        if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+        /* ocfs2_create_new_meta_bhs() should create it right! */
-                OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+        BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
-                status = -EIO;
-                goto bail;
-        }
        eb_el = &eb->h_list;
        root_el = et->et_root_el;
-        status = ocfs2_journal_access(handle, inode, new_eb_bh,
+        status = ocfs2_journal_access_eb(handle, inode, new_eb_bh,
-                                      OCFS2_JOURNAL_ACCESS_CREATE);
+                                         OCFS2_JOURNAL_ACCESS_CREATE);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
@@ -1080,8 +1193,8 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
                goto bail;
        }
-        status = ocfs2_journal_access(handle, inode, et->et_root_bh,
+        status = ocfs2_et_root_journal_access(handle, inode, et,
-                                      OCFS2_JOURNAL_ACCESS_WRITE);
+                                              OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
@@ -1176,18 +1289,13 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
                brelse(bh);
                bh = NULL;
-                status = ocfs2_read_block(inode, blkno, &bh);
+                status = ocfs2_read_extent_block(inode, blkno, &bh);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail;
                }
                eb = (struct ocfs2_extent_block *) bh->b_data;
-                if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-                        OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
-                        status = -EIO;
-                        goto bail;
-                }
                el = &eb->h_list;
                if (le16_to_cpu(el->l_next_free_rec) <
@@ -1540,7 +1648,7 @@ static int __ocfs2_find_path(struct inode *inode,
                brelse(bh);
                bh = NULL;
-                ret = ocfs2_read_block(inode, blkno, &bh);
+                ret = ocfs2_read_extent_block(inode, blkno, &bh);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
@@ -1548,11 +1656,6 @@ static int __ocfs2_find_path(struct inode *inode,
                eb = (struct ocfs2_extent_block *) bh->b_data;
                el = &eb->h_list;
-                if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-                        OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
-                        ret = -EIO;
-                        goto out;
-                }
                if (le16_to_cpu(el->l_next_free_rec) >
                    le16_to_cpu(el->l_count)) {
@@ -1860,25 +1963,23 @@ static int ocfs2_rotate_subtree_right(struct inode *inode,
        root_bh = left_path->p_node[subtree_index].bh;
        BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
-        ret = ocfs2_journal_access(handle, inode, root_bh,
+        ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                                           subtree_index);
        if (ret) {
                mlog_errno(ret);
                goto out;
        }
        for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
-                ret = ocfs2_journal_access(handle, inode,
+                ret = ocfs2_path_bh_journal_access(handle, inode,
-                                           right_path->p_node[i].bh,
+                                                   right_path, i);
-                                           OCFS2_JOURNAL_ACCESS_WRITE);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
                }
-                ret = ocfs2_journal_access(handle, inode,
+                ret = ocfs2_path_bh_journal_access(handle, inode,
-                                           left_path->p_node[i].bh,
+                                                   left_path, i);
-                                           OCFS2_JOURNAL_ACCESS_WRITE);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
@@ -2102,8 +2203,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
        *ret_left_path = NULL;
-        left_path = ocfs2_new_path(path_root_bh(right_path),
+        left_path = ocfs2_new_path_from_path(right_path);
-                                   path_root_el(right_path));
        if (!left_path) {
                ret = -ENOMEM;
                mlog_errno(ret);
@@ -2398,9 +2498,9 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
                        return -EAGAIN;
                if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
-                        ret = ocfs2_journal_access(handle, inode,
+                        ret = ocfs2_journal_access_eb(handle, inode,
-                                                   path_leaf_bh(right_path),
+                                                      path_leaf_bh(right_path),
-                                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                                                      OCFS2_JOURNAL_ACCESS_WRITE);
                        if (ret) {
                                mlog_errno(ret);
                                goto out;
@@ -2417,8 +2517,8 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
                 * We have to update i_last_eb_blk during the meta
                 * data delete.
                 */
-                ret = ocfs2_journal_access(handle, inode, et_root_bh,
+                ret = ocfs2_et_root_journal_access(handle, inode, et,
-                                           OCFS2_JOURNAL_ACCESS_WRITE);
+                                                   OCFS2_JOURNAL_ACCESS_WRITE);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
@@ -2433,25 +2533,23 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
         */
        BUG_ON(right_has_empty && !del_right_subtree);
-        ret = ocfs2_journal_access(handle, inode, root_bh,
+        ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                                           subtree_index);
        if (ret) {
                mlog_errno(ret);
                goto out;
        }
        for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
-                ret = ocfs2_journal_access(handle, inode,
+                ret = ocfs2_path_bh_journal_access(handle, inode,
-                                           right_path->p_node[i].bh,
+                                                   right_path, i);
-                                           OCFS2_JOURNAL_ACCESS_WRITE);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
                }
-                ret = ocfs2_journal_access(handle, inode,
+                ret = ocfs2_path_bh_journal_access(handle, inode,
-                                           left_path->p_node[i].bh,
+                                                   left_path, i);
-                                           OCFS2_JOURNAL_ACCESS_WRITE);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
@@ -2596,16 +2694,17 @@ out:
 static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
                                            handle_t *handle,
-                                            struct buffer_head *bh,
+                                            struct ocfs2_path *path)
-                                            struct ocfs2_extent_list *el)
 {
        int ret;
+        struct buffer_head *bh = path_leaf_bh(path);
+        struct ocfs2_extent_list *el = path_leaf_el(path);
        if (!ocfs2_is_empty_extent(&el->l_recs[0]))
                return 0;
-        ret = ocfs2_journal_access(handle, inode, bh,
+        ret = ocfs2_path_bh_journal_access(handle, inode, path,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                                           path_num_items(path) - 1);
        if (ret) {
                mlog_errno(ret);
                goto out;
@@ -2644,8 +2743,7 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
                goto out;
        }
-        left_path = ocfs2_new_path(path_root_bh(path),
+        left_path = ocfs2_new_path_from_path(path);
-                                   path_root_el(path));
        if (!left_path) {
                ret = -ENOMEM;
                mlog_errno(ret);
@@ -2654,8 +2752,7 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
        ocfs2_cp_path(left_path, path);
-        right_path = ocfs2_new_path(path_root_bh(path),
+        right_path = ocfs2_new_path_from_path(path);
-                                    path_root_el(path));
        if (!right_path) {
                ret = -ENOMEM;
                mlog_errno(ret);
@@ -2689,9 +2786,8 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
                 * Caller might still want to make changes to the
                 * tree root, so re-add it to the journal here.
                 */
-                ret = ocfs2_journal_access(handle, inode,
+                ret = ocfs2_path_bh_journal_access(handle, inode,
-                                           path_root_bh(left_path),
+                                                   left_path, 0);
-                                           OCFS2_JOURNAL_ACCESS_WRITE);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
@@ -2785,8 +2881,7 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
                 * We have a path to the left of this one - it needs
                 * an update too.
                 */
-                left_path = ocfs2_new_path(path_root_bh(path),
+                left_path = ocfs2_new_path_from_path(path);
-                                           path_root_el(path));
                if (!left_path) {
                        ret = -ENOMEM;
                        mlog_errno(ret);
@@ -2875,8 +2970,7 @@ rightmost_no_delete:
                 * it up front.
                 */
                ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
-                                                       path_leaf_bh(path),
+                                                       path);
-                                                       path_leaf_el(path));
                if (ret)
                        mlog_errno(ret);
                goto out;
@@ -3027,8 +3121,7 @@ static int ocfs2_get_right_path(struct inode *inode,
        /* This function shouldn't be called for the rightmost leaf. */
        BUG_ON(right_cpos == 0);
-        right_path = ocfs2_new_path(path_root_bh(left_path),
+        right_path = ocfs2_new_path_from_path(left_path);
-                                    path_root_el(left_path));
        if (!right_path) {
                ret = -ENOMEM;
                mlog_errno(ret);
@@ -3111,8 +3204,8 @@ static int ocfs2_merge_rec_right(struct inode *inode,
                root_bh = left_path->p_node[subtree_index].bh;
                BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
-                ret = ocfs2_journal_access(handle, inode, root_bh,
+                ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
-                                           OCFS2_JOURNAL_ACCESS_WRITE);
+                                                   subtree_index);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
@@ -3120,17 +3213,15 @@ static int ocfs2_merge_rec_right(struct inode *inode,
                for (i = subtree_index + 1;
                     i < path_num_items(right_path); i++) {
-                        ret = ocfs2_journal_access(handle, inode,
+                        ret = ocfs2_path_bh_journal_access(handle, inode,
-                                                   right_path->p_node[i].bh,
+                                                           right_path, i);
-                                                   OCFS2_JOURNAL_ACCESS_WRITE);
                        if (ret) {
                                mlog_errno(ret);
                                goto out;
                        }
-                        ret = ocfs2_journal_access(handle, inode,
+                        ret = ocfs2_path_bh_journal_access(handle, inode,
-                                                   left_path->p_node[i].bh,
+                                                           left_path, i);
-                                                   OCFS2_JOURNAL_ACCESS_WRITE);
                        if (ret) {
                                mlog_errno(ret);
                                goto out;
@@ -3142,8 +3233,8 @@ static int ocfs2_merge_rec_right(struct inode *inode,
                right_rec = &el->l_recs[index + 1];
        }
-        ret = ocfs2_journal_access(handle, inode, bh,
+        ret = ocfs2_path_bh_journal_access(handle, inode, left_path,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                                           path_num_items(left_path) - 1);
        if (ret) {
                mlog_errno(ret);
                goto out;
@@ -3199,8 +3290,7 @@ static int ocfs2_get_left_path(struct inode *inode,
        /* This function shouldn't be called for the leftmost leaf. */
        BUG_ON(left_cpos == 0);
-        left_path = ocfs2_new_path(path_root_bh(right_path),
+        left_path = ocfs2_new_path_from_path(right_path);
-                                   path_root_el(right_path));
        if (!left_path) {
                ret = -ENOMEM;
                mlog_errno(ret);
@@ -3283,8 +3373,8 @@ static int ocfs2_merge_rec_left(struct inode *inode,
                root_bh = left_path->p_node[subtree_index].bh;
                BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
-                ret = ocfs2_journal_access(handle, inode, root_bh,
+                ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
-                                           OCFS2_JOURNAL_ACCESS_WRITE);
+                                                   subtree_index);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
@@ -3292,17 +3382,15 @@ static int ocfs2_merge_rec_left(struct inode *inode,
                for (i = subtree_index + 1;
                     i < path_num_items(right_path); i++) {
-                        ret = ocfs2_journal_access(handle, inode,
+                        ret = ocfs2_path_bh_journal_access(handle, inode,
-                                                   right_path->p_node[i].bh,
+                                                           right_path, i);
-                                                   OCFS2_JOURNAL_ACCESS_WRITE);
                        if (ret) {
                                mlog_errno(ret);
                                goto out;
                        }
-                        ret = ocfs2_journal_access(handle, inode,
+                        ret = ocfs2_path_bh_journal_access(handle, inode,
-                                                   left_path->p_node[i].bh,
+                                                           left_path, i);
-                                                   OCFS2_JOURNAL_ACCESS_WRITE);
                        if (ret) {
                                mlog_errno(ret);
                                goto out;
@@ -3314,8 +3402,8 @@ static int ocfs2_merge_rec_left(struct inode *inode,
                        has_empty_extent = 1;
        }
-        ret = ocfs2_journal_access(handle, inode, bh,
+        ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                                           path_num_items(right_path) - 1);
        if (ret) {
                mlog_errno(ret);
                goto out;
@@ -3732,8 +3820,7 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
                 * leftmost leaf.
                 */
                if (left_cpos) {
-                        left_path = ocfs2_new_path(path_root_bh(right_path),
+                        left_path = ocfs2_new_path_from_path(right_path);
-                                                   path_root_el(right_path));
                        if (!left_path) {
                                ret = -ENOMEM;
                                mlog_errno(ret);
@@ -3781,7 +3868,7 @@ static void ocfs2_split_record(struct inode *inode,
        struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
        struct ocfs2_extent_rec *rec, *tmprec;
-        right_el = path_leaf_el(right_path);;
+        right_el = path_leaf_el(right_path);
        if (left_path)
                left_el = path_leaf_el(left_path);
@@ -3958,8 +4045,8 @@ static int ocfs2_do_insert_extent(struct inode *inode,
        el = et->et_root_el;
-        ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
+        ret = ocfs2_et_root_journal_access(handle, inode, et,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                                           OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
                goto out;
@@ -3970,7 +4057,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
                goto out_update_clusters;
        }
-        right_path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
+        right_path = ocfs2_new_path_from_et(et);
        if (!right_path) {
                ret = -ENOMEM;
                mlog_errno(ret);
@@ -4020,8 +4107,8 @@ static int ocfs2_do_insert_extent(struct inode *inode,
                 * ocfs2_rotate_tree_right() might have extended the
                 * transaction without re-journaling our tree root.
                 */
-                ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
+                ret = ocfs2_et_root_journal_access(handle, inode, et,
-                                           OCFS2_JOURNAL_ACCESS_WRITE);
+                                                   OCFS2_JOURNAL_ACCESS_WRITE);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
@@ -4082,8 +4169,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
                        goto out;
                if (left_cpos != 0) {
-                        left_path = ocfs2_new_path(path_root_bh(path),
+                        left_path = ocfs2_new_path_from_path(path);
-                                                   path_root_el(path));
                        if (!left_path)
                                goto out;
@@ -4097,8 +4183,15 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
                            le16_to_cpu(new_el->l_count)) {
                                bh = path_leaf_bh(left_path);
                                eb = (struct ocfs2_extent_block *)bh->b_data;
-                                OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
+                                ocfs2_error(inode->i_sb,
-                                                                 eb);
+                                            "Extent block #%llu has an "
+                                            "invalid l_next_free_rec of "
+                                            "%d.  It should have "
+                                            "matched the l_count of %d",
+                                            (unsigned long long)le64_to_cpu(eb->h_blkno),
+                                            le16_to_cpu(new_el->l_next_free_rec),
+                                            le16_to_cpu(new_el->l_count));
+                                status = -EINVAL;
                                goto out;
                        }
                        rec = &new_el->l_recs[
@@ -4132,8 +4225,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
                if (right_cpos == 0)
                        goto out;
-                right_path = ocfs2_new_path(path_root_bh(path),
+                right_path = ocfs2_new_path_from_path(path);
-                                            path_root_el(path));
                if (!right_path)
                        goto out;
@@ -4147,8 +4239,12 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
                        if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
                                bh = path_leaf_bh(right_path);
                                eb = (struct ocfs2_extent_block *)bh->b_data;
-                                OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
+                                ocfs2_error(inode->i_sb,
-                                                                 eb);
+                                            "Extent block #%llu has an "
+                                            "invalid l_next_free_rec of %d",
+                                            (unsigned long long)le64_to_cpu(eb->h_blkno),
+                                            le16_to_cpu(new_el->l_next_free_rec));
+                                status = -EINVAL;
                                goto out;
                        }
                        rec = &new_el->l_recs[1];
@@ -4294,7 +4390,9 @@ static int ocfs2_figure_insert_type(struct inode *inode,
                 * ocfs2_figure_insert_type() and ocfs2_add_branch()
                 * may want it later.
                 */
-                ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), &bh);
+                ret = ocfs2_read_extent_block(inode,
+                                              ocfs2_et_get_last_eb_blk(et),
+                                              &bh);
                if (ret) {
                        mlog_exit(ret);
                        goto out;
@@ -4320,7 +4418,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
                return 0;
        }
-        path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
+        path = ocfs2_new_path_from_et(et);
        if (!path) {
                ret = -ENOMEM;
                mlog_errno(ret);
@@ -4531,9 +4629,9 @@ int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
        BUG_ON(num_bits > clusters_to_add);
-        /* reserve our write early -- insert_extent may update the inode */
+        /* reserve our write early -- insert_extent may update the tree root */
-        status = ocfs2_journal_access(handle, inode, et->et_root_bh,
+        status = ocfs2_et_root_journal_access(handle, inode, et,
-                                      OCFS2_JOURNAL_ACCESS_WRITE);
+                                              OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
@@ -4760,20 +4858,15 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
        if (path->p_tree_depth) {
                struct ocfs2_extent_block *eb;
-                ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et),
+                ret = ocfs2_read_extent_block(inode,
-                                       &last_eb_bh);
+                                              ocfs2_et_get_last_eb_blk(et),
+                                              &last_eb_bh);
                if (ret) {
                        mlog_exit(ret);
                        goto out;
                }
                eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
-                if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-                        OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
-                        ret = -EROFS;
-                        goto out;
-                }
                rightmost_el = &eb->h_list;
        } else
                rightmost_el = path_root_el(path);
@@ -4854,7 +4947,7 @@ int ocfs2_mark_extent_written(struct inode *inode,
        if (et->et_ops == &ocfs2_dinode_et_ops)
                ocfs2_extent_map_trunc(inode, 0);
-        left_path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
+        left_path = ocfs2_new_path_from_et(et);
        if (!left_path) {
                ret = -ENOMEM;
                mlog_errno(ret);
@@ -4918,8 +5011,9 @@ static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
        depth = path->p_tree_depth;
        if (depth > 0) {
-                ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et),
+                ret = ocfs2_read_extent_block(inode,
-                                       &last_eb_bh);
+                                              ocfs2_et_get_last_eb_blk(et),
+                                              &last_eb_bh);
                if (ret < 0) {
                        mlog_errno(ret);
                        goto out;
@@ -5025,8 +5119,7 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
                }
                if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
-                        left_path = ocfs2_new_path(path_root_bh(path),
+                        left_path = ocfs2_new_path_from_path(path);
-                                                   path_root_el(path));
                        if (!left_path) {
                                ret = -ENOMEM;
                                mlog_errno(ret);
@@ -5135,7 +5228,7 @@ int ocfs2_remove_extent(struct inode *inode,
        ocfs2_extent_map_trunc(inode, 0);
-        path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
+        path = ocfs2_new_path_from_et(et);
        if (!path) {
                ret = -ENOMEM;
                mlog_errno(ret);
@@ -5255,6 +5348,81 @@ out:
        return ret;
 }
+/*
+ * Remove len clusters starting at logical offset cpos from the extent
+ * tree et, and record the freed physical range (starting at cluster
+ * phys_cpos) in the truncate log.
+ *
+ * The work is done under the truncate log inode's i_mutex and inside a
+ * single transaction.  The truncate log is flushed first when
+ * ocfs2_truncate_log_needs_flush() says so.
+ *
+ * Returns 0 on success, otherwise the error from the step that failed.
+ */
+int ocfs2_remove_btree_range(struct inode *inode,
+                             struct ocfs2_extent_tree *et,
+                             u32 cpos, u32 phys_cpos, u32 len,
+                             struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+        int ret;
+        u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
+        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        struct inode *tl_inode = osb->osb_tl_inode;
+        handle_t *handle;
+        struct ocfs2_alloc_context *meta_ac = NULL;
+        ret = ocfs2_lock_allocators(inode, et, 0, 1, NULL, &meta_ac);
+        if (ret) {
+                mlog_errno(ret);
+                return ret;
+        }
+        mutex_lock(&tl_inode->i_mutex);
+        if (ocfs2_truncate_log_needs_flush(osb)) {
+                ret = __ocfs2_flush_truncate_log(osb);
+                if (ret < 0) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+        }
+        handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb));
+        if (IS_ERR(handle)) {
+                ret = PTR_ERR(handle);
+                mlog_errno(ret);
+                goto out;
+        }
+        ret = ocfs2_et_root_journal_access(handle, inode, et,
+                                           OCFS2_JOURNAL_ACCESS_WRITE);
+        if (ret) {
+                mlog_errno(ret);
+                /*
+                 * The transaction is already running here, so it has
+                 * to be committed on the way out; jumping straight to
+                 * "out" would leak the handle.
+                 */
+                goto out_commit;
+        }
+        vfs_dq_free_space_nodirty(inode,
+                                  ocfs2_clusters_to_bytes(inode->i_sb, len));
+        ret = ocfs2_remove_extent(inode, et, cpos, len, handle, meta_ac,
+                                  dealloc);
+        if (ret) {
+                mlog_errno(ret);
+                goto out_commit;
+        }
+        /* The tree root now covers len fewer clusters. */
+        ocfs2_et_update_clusters(inode, et, -len);
+        ret = ocfs2_journal_dirty(handle, et->et_root_bh);
+        if (ret) {
+                mlog_errno(ret);
+                goto out_commit;
+        }
+        ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
+        if (ret)
+                mlog_errno(ret);
+out_commit:
+        ocfs2_commit_trans(osb, handle);
+out:
+        mutex_unlock(&tl_inode->i_mutex);
+        if (meta_ac)
+                ocfs2_free_alloc_context(meta_ac);
+        return ret;
+}
 int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
 {
        struct buffer_head *tl_bh = osb->osb_tl_bh;
@@ -5308,13 +5476,13 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
        start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
        di = (struct ocfs2_dinode *) tl_bh->b_data;
-        tl = &di->id2.i_dealloc;
-        if (!OCFS2_IS_VALID_DINODE(di)) {
-                OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
-                status = -EIO;
-                goto bail;
-        }
+        /* tl_bh is loaded from ocfs2_truncate_log_init().  It's validated
+         * by the underlying call to ocfs2_read_inode_block(), so any
+         * corruption is a code bug */
+        BUG_ON(!OCFS2_IS_VALID_DINODE(di));
+        tl = &di->id2.i_dealloc;
        tl_count = le16_to_cpu(tl->tl_count);
        mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
                        tl_count == 0,
@@ -5332,8 +5500,8 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
                goto bail;
        }
-        status = ocfs2_journal_access(handle, tl_inode, tl_bh,
+        status = ocfs2_journal_access_di(handle, tl_inode, tl_bh,
-                                      OCFS2_JOURNAL_ACCESS_WRITE);
+                                         OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
@@ -5394,8 +5562,8 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
        while (i >= 0) {
                /* Caller has given us at least enough credits to
                 * update the truncate log dinode */
-                status = ocfs2_journal_access(handle, tl_inode, tl_bh,
+                status = ocfs2_journal_access_di(handle, tl_inode, tl_bh,
-                                              OCFS2_JOURNAL_ACCESS_WRITE);
+                                                 OCFS2_JOURNAL_ACCESS_WRITE);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail;
@@ -5464,13 +5632,13 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
        BUG_ON(mutex_trylock(&tl_inode->i_mutex));
        di = (struct ocfs2_dinode *) tl_bh->b_data;
-        tl = &di->id2.i_dealloc;
-        if (!OCFS2_IS_VALID_DINODE(di)) {
-                OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
-                status = -EIO;
-                goto out;
-        }
+        /* tl_bh is loaded from ocfs2_truncate_log_init().  It's validated
+         * by the underlying call to ocfs2_read_inode_block(), so any
+         * corruption is a code bug */
+        BUG_ON(!OCFS2_IS_VALID_DINODE(di));
+        tl = &di->id2.i_dealloc;
        num_to_flush = le16_to_cpu(tl->tl_used);
        mlog(0, "Flush %u records from truncate log #%llu\n",
             num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
@@ -5586,7 +5754,7 @@ static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
                goto bail;
        }
-        status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
+        status = ocfs2_read_inode_block(inode, &bh);
        if (status < 0) {
                iput(inode);
                mlog_errno(status);
@@ -5625,13 +5793,13 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
        }
        di = (struct ocfs2_dinode *) tl_bh->b_data;
-        tl = &di->id2.i_dealloc;
-        if (!OCFS2_IS_VALID_DINODE(di)) {
-                OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di);
-                status = -EIO;
-                goto bail;
-        }
+        /* tl_bh is loaded from ocfs2_get_truncate_log_info().  It's
+         * validated by the underlying call to ocfs2_read_inode_block(),
+         * so any corruption is a code bug */
+        BUG_ON(!OCFS2_IS_VALID_DINODE(di));
+        tl = &di->id2.i_dealloc;
        if (le16_to_cpu(tl->tl_used)) {
                mlog(0, "We'll have %u logs to recover\n",
                     le16_to_cpu(tl->tl_used));
@@ -5651,6 +5819,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
                 * tl_used. */
                tl->tl_used = 0;
+                ocfs2_compute_meta_ecc(osb->sb, tl_bh->b_data, &di->i_check);
                status = ocfs2_write_block(osb, tl_bh, tl_inode);
                if (status < 0) {
                        mlog_errno(status);
@@ -5800,7 +5969,10 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
 */
 /*
- * Describes a single block free from a suballocator
+ * Describe a single bit freed from a suballocator.  For the block
+ * suballocators, it represents one block.  For the global cluster
+ * allocator, it represents a run of clusters: free_blk is the starting
+ * block and free_bit holds the number of clusters.
 */
 struct ocfs2_cached_block_free {
        struct ocfs2_cached_block_free          *free_next;
@@ -5815,10 +5987,10 @@ struct ocfs2_per_slot_free_list {
        struct ocfs2_cached_block_free          *f_first;
 };
-static int ocfs2_free_cached_items(struct ocfs2_super *osb,
+static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
-                                   int sysfile_type,
+                                    int sysfile_type,
-                                   int slot,
+                                    int slot,
-                                   struct ocfs2_cached_block_free *head)
+                                    struct ocfs2_cached_block_free *head)
 {
        int ret;
        u64 bg_blkno;
@@ -5893,6 +6065,82 @@ out:
        return ret;
 }
+int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
+                                u64 blkno, unsigned int bit)
+{       /*
+         * Queue a cluster range for deferred freeing.  A new list item
+         * recording (blkno, bit) is pushed onto the head of
+         * ctxt->c_global_allocator; ocfs2_run_deallocs() later hands that
+         * list to ocfs2_free_cached_clusters().
+         *
+         * Returns 0 on success, or -ENOMEM if the item cannot be
+         * allocated (in which case ctxt is left untouched).
+         */
+        int ret = 0;
+        struct ocfs2_cached_block_free *item;
+        item = kmalloc(sizeof(*item), GFP_NOFS);
+        if (item == NULL) {
+                ret = -ENOMEM;
+                mlog_errno(ret);
+                return ret;
+        }
+        mlog(0, "Insert clusters: (bit %u, blk %llu)\n",
+             bit, (unsigned long long)blkno);
+        item->free_blk = blkno;
+        item->free_bit = bit;
+        item->free_next = ctxt->c_global_allocator;     /* push at head */
+        ctxt->c_global_allocator = item;
+        return ret;
+}
+static int ocfs2_free_cached_clusters(struct ocfs2_super *osb,
+                                      struct ocfs2_cached_block_free *head)
+{       /*
+         * Move every cached cluster range on the list 'head' into the
+         * truncate log, consuming (kfree'ing) the list as it goes.
+         *
+         * The truncate log inode's i_mutex is held across the whole walk;
+         * __ocfs2_flush_truncate_log() expects its caller to hold it.
+         * Each item gets its own transaction: flush the log first if
+         * ocfs2_truncate_log_needs_flush() says so, start a handle with
+         * OCFS2_TRUNCATE_LOG_UPDATE credits, append (free_blk, free_bit),
+         * then commit.
+         *
+         * Returns 0 if every item was appended, otherwise the first error.
+         * On error the walk stops and the remaining items are freed
+         * without being logged.
+         */
+        struct ocfs2_cached_block_free *tmp;
+        struct inode *tl_inode = osb->osb_tl_inode;
+        handle_t *handle;
+        int ret = 0;
+        mutex_lock(&tl_inode->i_mutex);
+        while (head) {
+                if (ocfs2_truncate_log_needs_flush(osb)) {
+                        ret = __ocfs2_flush_truncate_log(osb);
+                        if (ret < 0) {
+                                mlog_errno(ret);
+                                break;
+                        }
+                }
+                handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
+                if (IS_ERR(handle)) {
+                        ret = PTR_ERR(handle);
+                        mlog_errno(ret);
+                        break;
+                }
+                ret = ocfs2_truncate_log_append(osb, handle, head->free_blk,
+                                                head->free_bit);
+                ocfs2_commit_trans(osb, handle);        /* commit even if the append failed */
+                tmp = head;
+                head = head->free_next;
+                kfree(tmp);     /* the item is released before ret is checked */
+                if (ret < 0) {
+                        mlog_errno(ret);
+                        break;
+                }
+        }
+        mutex_unlock(&tl_inode->i_mutex);
+        while (head) {
+                /* Premature exit may have left some dangling items.
+                 * They are only freed here, not appended to the log. */
+                tmp = head;
+                head = head->free_next;
+                kfree(tmp);
+        }
+        return ret;
+}
 int ocfs2_run_deallocs(struct ocfs2_super *osb,
                       struct ocfs2_cached_dealloc_ctxt *ctxt)
 {
@@ -5908,8 +6156,10 @@ int ocfs2_run_deallocs(struct ocfs2_super *osb,
                if (fl->f_first) {
                        mlog(0, "Free items: (type %u, slot %d)\n",
                             fl->f_inode_type, fl->f_slot);
-                        ret2 = ocfs2_free_cached_items(osb, fl->f_inode_type,
+                        ret2 = ocfs2_free_cached_blocks(osb,
-                                                       fl->f_slot, fl->f_first);
+                                                        fl->f_inode_type,
+                                                        fl->f_slot,
+                                                        fl->f_first);
                        if (ret2)
                                mlog_errno(ret2);
                        if (!ret)
@@ -5920,6 +6170,17 @@ int ocfs2_run_deallocs(struct ocfs2_super *osb,
                kfree(fl);
        }
+        if (ctxt->c_global_allocator) {
+                ret2 = ocfs2_free_cached_clusters(osb,
+                                                  ctxt->c_global_allocator);
+                if (ret2)
+                        mlog_errno(ret2);
+                if (!ret)
+                        ret = ret2;
+                ctxt->c_global_allocator = NULL;
+        }
        return ret;
 }
@@ -6075,11 +6336,10 @@ static int ocfs2_find_new_last_ext_blk(struct inode *inode,
        eb = (struct ocfs2_extent_block *) bh->b_data;
        el = &eb->h_list;
-        if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-                OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+        /* ocfs2_find_leaf() gets the eb from ocfs2_read_extent_block().
-                ret = -EROFS;
+         * Any corruption is a code bug. */
-                goto out;
+        BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
-        }
        *new_last_eb = bh;
        get_bh(*new_last_eb);
@@ -6326,8 +6586,8 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
        }
        if (last_eb_bh) {
-                status = ocfs2_journal_access(handle, inode, last_eb_bh,
+                status = ocfs2_journal_access_eb(handle, inode, last_eb_bh,
-                                              OCFS2_JOURNAL_ACCESS_WRITE);
+                                                 OCFS2_JOURNAL_ACCESS_WRITE);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail;
@@ -6350,6 +6610,8 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
                goto bail;
        }
+        vfs_dq_free_space_nodirty(inode,
+                        ocfs2_clusters_to_bytes(osb->sb, clusters_to_del));
        spin_lock(&OCFS2_I(inode)->ip_lock);
        OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
                                      clusters_to_del;
@@ -6436,11 +6698,6 @@ static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
                mlog_errno(ret);
        else if (ocfs2_should_order_data(inode)) {
                ret = ocfs2_jbd2_file_inode(handle, inode);
-#ifdef CONFIG_OCFS2_COMPAT_JBD
-                ret = walk_page_buffers(handle, page_buffers(page),
-                                        from, to, &partial,
-                                        ocfs2_journal_dirty_data);
-#endif
                if (ret < 0)
                        mlog_errno(ret);
        }
@@ -6663,6 +6920,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
        struct page **pages = NULL;
        loff_t end = osb->s_clustersize;
        struct ocfs2_extent_tree et;
+        int did_quota = 0;
        has_data = i_size_read(inode) ? 1 : 0;
@@ -6682,15 +6940,16 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
                }
        }
-        handle = ocfs2_start_trans(osb, OCFS2_INLINE_TO_EXTENTS_CREDITS);
+        handle = ocfs2_start_trans(osb,
+                                   ocfs2_inline_to_extents_credits(osb->sb));
        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
                mlog_errno(ret);
                goto out_unlock;
        }
-        ret = ocfs2_journal_access(handle, inode, di_bh,
+        ret = ocfs2_journal_access_di(handle, inode, di_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                                      OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
                goto out_commit;
@@ -6701,6 +6960,13 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
                unsigned int page_end;
                u64 phys;
+                if (vfs_dq_alloc_space_nodirty(inode,
+                                       ocfs2_clusters_to_bytes(osb->sb, 1))) {
+                        ret = -EDQUOT;
+                        goto out_commit;
+                }
+                did_quota = 1;
                ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
                                           &num);
                if (ret) {
@@ -6774,6 +7040,10 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
        }
 out_commit:
+        if (ret < 0 && did_quota)
+                vfs_dq_free_space_nodirty(inode,
+                                          ocfs2_clusters_to_bytes(osb->sb, 1));
        ocfs2_commit_trans(osb, handle);
 out_unlock:
@@ -6813,7 +7083,8 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
        new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
                                                     i_size_read(inode));
-        path = ocfs2_new_path(fe_bh, &di->id2.i_list);
+        path = ocfs2_new_path(fe_bh, &di->id2.i_list,
+                              ocfs2_journal_access_di);
        if (!path) {
                status = -ENOMEM;
                mlog_errno(status);
@@ -6984,20 +7255,14 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
        ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
        if (fe->id2.i_list.l_tree_depth) {
-                status = ocfs2_read_block(inode, le64_to_cpu(fe->i_last_eb_blk),
+                status = ocfs2_read_extent_block(inode,
-                                          &last_eb_bh);
+                                                 le64_to_cpu(fe->i_last_eb_blk),
+                                                 &last_eb_bh);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail;
                }
                eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
-                if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-                        OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
-                        brelse(last_eb_bh);
-                        status = -EIO;
-                        goto bail;
-                }
        }
        (*tc)->tc_last_eb_bh = last_eb_bh;
@@ -7052,8 +7317,8 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
                goto out;
        }
-        ret = ocfs2_journal_access(handle, inode, di_bh,
+        ret = ocfs2_journal_access_di(handle, inode, di_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                                      OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
                goto out_commit;
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 70257c84cfbe..cceff5c37f47 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -45,7 +45,9 @@
 *
 * ocfs2_extent_tree contains info for the root of the b-tree, it must have a
 * root ocfs2_extent_list and a root_bh so that they can be used in the b-tree
- * functions.
+ * functions.  With metadata ecc, we now call different journal_access
+ * functions for each type of metadata, so it must have the
+ * root_journal_access function.
 * ocfs2_extent_tree_operations abstract the normal operations we do for
 * the root of extent b-tree.
 */
@@ -54,6 +56,7 @@ struct ocfs2_extent_tree {
        struct ocfs2_extent_tree_operations     *et_ops;
        struct buffer_head                      *et_root_bh;
        struct ocfs2_extent_list                *et_root_el;
+        ocfs2_journal_access_func               et_root_journal_access;
        void                                    *et_object;
        unsigned int                            et_max_leaf_clusters;
 };
@@ -68,10 +71,18 @@ void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
 void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
                                       struct inode *inode,
                                       struct buffer_head *bh);
+struct ocfs2_xattr_value_buf;
 void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
                                        struct inode *inode,
-                                        struct buffer_head *bh,
+                                        struct ocfs2_xattr_value_buf *vb);
-                                        struct ocfs2_xattr_value_root *xv);
+/*
+ * Read an extent block into *bh.  If *bh is NULL, a bh will be
+ * allocated.  This is a cached read.  The extent block will be validated
+ * with ocfs2_validate_extent_block().
+ */
+int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno,
+                            struct buffer_head **bh);
 struct ocfs2_alloc_context;
 int ocfs2_insert_extent(struct ocfs2_super *osb,
@@ -110,6 +121,11 @@ int ocfs2_remove_extent(struct inode *inode,
                        u32 cpos, u32 len, handle_t *handle,
                        struct ocfs2_alloc_context *meta_ac,
                        struct ocfs2_cached_dealloc_ctxt *dealloc);
+int ocfs2_remove_btree_range(struct inode *inode,
+                             struct ocfs2_extent_tree *et,
+                             u32 cpos, u32 phys_cpos, u32 len,
+                             struct ocfs2_cached_dealloc_ctxt *dealloc);
 int ocfs2_num_free_extents(struct ocfs2_super *osb,
                           struct inode *inode,
                           struct ocfs2_extent_tree *et);
@@ -167,10 +183,18 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb);
 */
 struct ocfs2_cached_dealloc_ctxt {
        struct ocfs2_per_slot_free_list         *c_first_suballocator;
+        struct ocfs2_cached_block_free          *c_global_allocator;
 };
 static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c)
 {
        c->c_first_suballocator = NULL;
+        c->c_global_allocator = NULL;
+}
+int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
+                                u64 blkno, unsigned int bit);
+static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c)
+{
+        return c->c_global_allocator != NULL;
 }
 int ocfs2_run_deallocs(struct ocfs2_super *osb,
                       struct ocfs2_cached_dealloc_ctxt *ctxt);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index c22543b33420..a067a6cffb01 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -27,6 +27,7 @@
 #include <linux/swap.h>
 #include <linux/pipe_fs_i.h>
 #include <linux/mpage.h>
+#include <linux/quotaops.h>
 #define MLOG_MASK_PREFIX ML_FILE_IO
 #include <cluster/masklog.h>
@@ -68,20 +69,13 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
                goto bail;
        }
-        status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
+        status = ocfs2_read_inode_block(inode, &bh);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }
        fe = (struct ocfs2_dinode *) bh->b_data;
-        if (!OCFS2_IS_VALID_DINODE(fe)) {
-                mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n",
-                     (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
-                     fe->i_signature);
-                goto bail;
-        }
        if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
                                                    le32_to_cpu(fe->i_clusters))) {
                mlog(ML_ERROR, "block offset is outside the allocated size: "
@@ -262,7 +256,7 @@ static int ocfs2_readpage_inline(struct inode *inode, struct page *page)
        BUG_ON(!PageLocked(page));
        BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));
-        ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
+        ret = ocfs2_read_inode_block(inode, &di_bh);
        if (ret) {
                mlog_errno(ret);
                goto out;
@@ -481,12 +475,6 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
        if (ocfs2_should_order_data(inode)) {
                ret = ocfs2_jbd2_file_inode(handle, inode);
-#ifdef CONFIG_OCFS2_COMPAT_JBD
-                ret = walk_page_buffers(handle,
-                                        page_buffers(page),
-                                        from, to, NULL,
-                                        ocfs2_journal_dirty_data);
-#endif
                if (ret < 0)
                        mlog_errno(ret);
        }
@@ -1072,15 +1060,8 @@ static void ocfs2_write_failure(struct inode *inode,
                tmppage = wc->w_pages[i];
                if (page_has_buffers(tmppage)) {
-                        if (ocfs2_should_order_data(inode)) {
+                        if (ocfs2_should_order_data(inode))
                                ocfs2_jbd2_file_inode(wc->w_handle, inode);
-#ifdef CONFIG_OCFS2_COMPAT_JBD
-                                walk_page_buffers(wc->w_handle,
-                                                  page_buffers(tmppage),
-                                                  from, to, NULL,
-                                                  ocfs2_journal_dirty_data);
-#endif
-                        }
                        block_commit_write(tmppage, from, to);
                }
@@ -1531,8 +1512,8 @@ static int ocfs2_write_begin_inline(struct address_space *mapping,
                goto out;
        }
-        ret = ocfs2_journal_access(handle, inode, wc->w_di_bh,
+        ret = ocfs2_journal_access_di(handle, inode, wc->w_di_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                                      OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                ocfs2_commit_trans(osb, handle);
@@ -1750,15 +1731,20 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
        wc->w_handle = handle;
+        if (clusters_to_alloc && vfs_dq_alloc_space_nodirty(inode,
+                        ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc))) {
+                ret = -EDQUOT;
+                goto out_commit;
+        }
        /*
         * We don't want this to fail in ocfs2_write_end(), so do it
         * here.
         */
-        ret = ocfs2_journal_access(handle, inode, wc->w_di_bh,
+        ret = ocfs2_journal_access_di(handle, inode, wc->w_di_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                                      OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
-                goto out_commit;
+                goto out_quota;
        }
        /*
@@ -1771,14 +1757,14 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
                                         mmap_page);
        if (ret) {
                mlog_errno(ret);
-                goto out_commit;
+                goto out_quota;
        }
        ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos,
                                          len);
        if (ret) {
                mlog_errno(ret);
-                goto out_commit;
+                goto out_quota;
        }
        if (data_ac)
@@ -1790,6 +1776,10 @@ success:
        *pagep = wc->w_target_page;
        *fsdata = wc;
        return 0;
+out_quota:
+        if (clusters_to_alloc)
+                vfs_dq_free_space(inode,
+                          ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc));
 out_commit:
        ocfs2_commit_trans(osb, handle);
@@ -1919,15 +1909,8 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
                }
                if (page_has_buffers(tmppage)) {
-                        if (ocfs2_should_order_data(inode)) {
+                        if (ocfs2_should_order_data(inode))
                                ocfs2_jbd2_file_inode(wc->w_handle, inode);
-#ifdef CONFIG_OCFS2_COMPAT_JBD
-                                walk_page_buffers(wc->w_handle,
-                                                  page_buffers(tmppage),
-                                                  from, to, NULL,
-                                                  ocfs2_journal_dirty_data);
-#endif
-                        }
                        block_commit_write(tmppage, from, to);
                }
        }
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
new file mode 100644
index 000000000000..2a947c44e594
--- /dev/null
+++ b/fs/ocfs2/blockcheck.c
@@ -0,0 +1,477 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * blockcheck.c
+ *
+ * Checksum and ECC codes for the OCFS2 userspace library.
+ *
+ * Copyright (C) 2006, 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License, version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/crc32.h>
+#include <linux/buffer_head.h>
+#include <linux/bitops.h>
+#include <asm/byteorder.h>
+#include <cluster/masklog.h>
+#include "ocfs2.h"
+#include "blockcheck.h"
+/*
+ * We use the following conventions:
+ *
+ * d = # data bits
+ * p = # parity bits
+ * c = # total code bits (d + p)
+ */
+/*
+ * Calculate the bit offset in the hamming code buffer based on the bit's
+ * offset in the data buffer.  Since the hamming code reserves all
+ * power-of-two bits for parity, the data bit number and the code bit
+ * number are offset by all the parity bits beforehand.
+ *
+ * Recall that bit numbers in hamming code are 1-based.  This function
+ * takes the 0-based data bit from the caller and returns the 1-based
+ * code bit.
+ *
+ * An example.  Take the first bit of the data buffer (data bit 0).  Code
+ * bit 1 is a power of two (2^0), so it's a parity bit.  Code bit 2 is a
+ * power of two (2^1), so it's a parity bit.  3 is not a power of two.
+ * So data bit 0 ends up as bit 3 in the code buffer.
+ *
+ * The caller can pass in p_cache if it wants to keep track of the most
+ * recent number of parity bits added.  This allows the function to start
+ * the calculation at the last place.  The cached count is only ever
+ * increased here, so one cache should be reused for bit numbers in
+ * ascending order (as ocfs2_hamming_encode() does).  If p_cache is NULL
+ * the count starts from zero.
+ */
+static unsigned int calc_code_bit(unsigned int i, unsigned int *p_cache)
+{
+        unsigned int b, p = 0;
+        /*
+         * Data bits are 0-based, but we're talking code bits, which
+         * are 1-based.
+         */
+        b = i + 1;
+        /* Use the cache if it is there */
+        if (p_cache)
+                p = *p_cache;
+        b += p;
+        /*
+         * For every power of two below our bit number, bump our bit.
+         *
+         * We compare with (b + 1) because we have to compare with what b
+         * would be _if_ it were bumped up by the parity bit.  Capice?
+         *
+         * p is set above.
+         */
+        for (; (1 << p) < (b + 1); p++)
+                b++;
+        if (p_cache)
+                *p_cache = p;
+        return b;
+}
+/*
+ * This is the low level encoder function.  It can be called across
+ * multiple hunks just like the crc32 code.  'd' is the number of bits
+ * _in_this_hunk_.  nr is the bit offset of this hunk.  So, if you had
+ * two 512B buffers, you would do it like so:
+ *
+ * parity = ocfs2_hamming_encode(0, buf1, 512 * 8, 0);
+ * parity = ocfs2_hamming_encode(parity, buf2, 512 * 8, 512 * 8);
+ *
+ * 'parity' is the running value from the previous hunks (0 for the
+ * first hunk); the updated value is returned.  d must be non-zero.
+ *
+ * If you just have one buffer, use ocfs2_hamming_encode_block().
+ */
+u32 ocfs2_hamming_encode(u32 parity, void *data, unsigned int d, unsigned int nr)
+{
+        unsigned int i, b, p = 0;
+        BUG_ON(!d);
+        /*
+         * b is the hamming code bit number.  Hamming code specifies a
+         * 1-based array, but C uses 0-based.  So 'i' is for C, and 'b' is
+         * for the algorithm.
+         *
+         * The i++ in the for loop is so that the start offset passed
+         * to ocfs2_find_next_bit() is one greater than the previously
+         * found bit.
+         */
+        for (i = 0; (i = ocfs2_find_next_bit(data, d, i)) < d; i++)
+        {
+                /*
+                 * i is the offset in this hunk, nr + i is the total bit
+                 * offset.  p caches the parity-bit count between
+                 * iterations so calc_code_bit() does not recount from zero.
+                 */
+                b = calc_code_bit(nr + i, &p);
+                /*
+                 * Data bits in the resultant code are checked by
+                 * parity bits that are part of the bit number
+                 * representation.  Huh?
+                 *
+                 * <wikipedia href="http://en.wikipedia.org/wiki/Hamming_code">
+                 * In other words, the parity bit at position 2^k
+                 * checks bits in positions having bit k set in
+                 * their binary representation.  Conversely, for
+                 * instance, bit 13, i.e. 1101(2), is checked by
+                 * bits 1000(2) = 8, 0100(2)=4 and 0001(2) = 1.
+                 * </wikipedia>
+                 *
+                 * Note that 'k' is the _code_ bit number.  'b' in
+                 * our loop.
+                 *
+                 * So XORing b into parity for every set data bit
+                 * toggles exactly the parity bits that check it.
+                 */
+                parity ^= b;
+        }
+        /* While the data buffer was treated as little endian, the
+         * return value is in host endian. */
+        return parity;
+}
+u32 ocfs2_hamming_encode_block(void *data, unsigned int blocksize)
+{       /*
+         * Single-buffer form of ocfs2_hamming_encode(): blocksize is in
+         * bytes, so all blocksize * 8 bits of data are encoded as one
+         * hunk at bit offset 0, starting from a parity of 0.
+         */
+        return ocfs2_hamming_encode(0, data, blocksize * 8, 0);
+}
+/*
+ * Like ocfs2_hamming_encode(), this can handle hunks.  nr is the bit
+ * offset of the current hunk and d is the number of bits in it.  'fix'
+ * is the 1-based code bit number to flip.  If the bit to be fixed is not
+ * part of the current hunk, this does nothing.
+ *
+ * If you only have one hunk, use ocfs2_hamming_fix_block().
+ */
+void ocfs2_hamming_fix(void *data, unsigned int d, unsigned int nr,
+                       unsigned int fix)
+{
+        unsigned int i, b;
+        BUG_ON(!d);
+        /*
+         * If the bit to fix has an hweight of 1, it's a parity bit.  One
+         * busted parity bit is its own error.  Nothing to do here.
+         */
+        if (hweight32(fix) == 1)
+                return;
+        /*
+         * nr + d is the bit right past the data hunk we're looking at.
+         * If fix is at or after that bit's code position, nothing to do.
+         */
+        if (fix >= calc_code_bit(nr + d, NULL))
+                return;
+        /*
+         * nr is the offset in the data hunk we're starting at.  Let's
+         * start b at the offset in the code buffer.  See
+         * ocfs2_hamming_encode() for a more detailed description of 'b'.
+         */
+        b = calc_code_bit(nr, NULL);
+        /* If the fix is before this hunk, nothing to do */
+        if (fix < b)
+                return;
+        for (i = 0; i < d; i++, b++)
+        {
+                /* Skip past parity bits */
+                while (hweight32(b) == 1)
+                        b++;
+                /*
+                 * i is the offset in this data hunk.
+                 * nr + i is the offset in the total data buffer.
+                 * b is the offset in the total code buffer.
+                 *
+                 * Thus, when b == fix, bit i in the current hunk needs
+                 * fixing.  Fixing means flipping the bit; only one bit
+                 * is ever changed per call.
+                 */
+                if (b == fix)
+                {
+                        if (ocfs2_test_bit(i, data))
+                                ocfs2_clear_bit(i, data);
+                        else
+                                ocfs2_set_bit(i, data);
+                        break;
+                }
+        }
+}
+void ocfs2_hamming_fix_block(void *data, unsigned int blocksize,
+                             unsigned int fix)
+{       /*
+         * Single-buffer form of ocfs2_hamming_fix(): blocksize is in
+         * bytes, so data is treated as one hunk of blocksize * 8 bits
+         * at bit offset 0.
+         */
+        ocfs2_hamming_fix(data, blocksize * 8, 0, fix);
+}
+/*
+ * Generate the check information (crc32 and ecc) for one block.
+ * data is the block to be summed; bc is the ocfs2_block_check structure
+ * that receives the result.
+ *
+ * bc is zeroed before anything is calculated, so it should point inside
+ * data.  If it does not, the caller is responsible for zeroing any
+ * ocfs2_block_check structure that lives inline in data.
+ *
+ * data must already be in on-disk endian (little endian for ocfs2).
+ * On return bc holds little-endian values and is ready to go to disk.
+ */
+void ocfs2_block_check_compute(void *data, size_t blocksize,
+                               struct ocfs2_block_check *bc)
+{
+        u32 crc32, parity;
+        /* Clear the check field before either code is calculated */
+        memset(bc, 0, sizeof(*bc));
+        crc32 = crc32_le(~0, data, blocksize);
+        parity = ocfs2_hamming_encode_block(data, blocksize);
+        /*
+         * No ecc'd ocfs2 structure is larger than 4K, so the parity
+         * must fit in 16 bits.
+         */
+        BUG_ON(parity > USHORT_MAX);
+        bc->bc_ecc = cpu_to_le16((u16)parity);
+        bc->bc_crc32e = cpu_to_le32(crc32);
+}
+/*
+ * This function validates existing check information.  Like _compute,
+ * the function will take care of zeroing bc before calculating check codes.
+ * If bc is not a pointer inside data, the caller must have zeroed any
+ * inline ocfs2_block_check structures.
+ *
+ * Again, the data passed in should be the on-disk endian.
+ *
+ * Returns 0 when the crc32 matches the stored value, either as the data
+ * was passed in or after one ECC fix has been applied to data in place.
+ * Returns -EIO when the crc32 still does not match after the fix; the
+ * fix is not undone in that case.  On every path the stored crc32 and
+ * ecc are written back into bc before returning.
+ */
+int ocfs2_block_check_validate(void *data, size_t blocksize,
+                               struct ocfs2_block_check *bc)
+{
+        int rc = 0;
+        struct ocfs2_block_check check;
+        u32 crc, ecc;
+        check.bc_crc32e = le32_to_cpu(bc->bc_crc32e);
+        check.bc_ecc = le16_to_cpu(bc->bc_ecc);
+        memset(bc, 0, sizeof(struct ocfs2_block_check));
+        /*
+         * Fast path - if the crc32 validates, we're good to go.
+         * At this point check holds the stored values in cpu order and
+         * bc itself has been zeroed, matching the state it was in when
+         * the codes were computed.
+         */
+        crc = crc32_le(~0, data, blocksize);
+        if (crc == check.bc_crc32e)
+                goto out;
+        mlog(ML_ERROR,
+             "CRC32 failed: stored: %u, computed %u.  Applying ECC.\n",
+             (unsigned int)check.bc_crc32e, (unsigned int)crc);
+        /*
+         * Ok, try ECC fixups.  The fix value is the stored parity xor'd
+         * with the parity of the data as it is now.
+         */
+        ecc = ocfs2_hamming_encode_block(data, blocksize);
+        ocfs2_hamming_fix_block(data, blocksize, ecc ^ check.bc_ecc);
+        /* And check the crc32 again, now over the possibly-modified data */
+        crc = crc32_le(~0, data, blocksize);
+        if (crc == check.bc_crc32e)
+                goto out;
+        mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n",
+             (unsigned int)check.bc_crc32e, (unsigned int)crc);
+        rc = -EIO;
+out:
+        bc->bc_crc32e = cpu_to_le32(check.bc_crc32e);
+        bc->bc_ecc = cpu_to_le16(check.bc_ecc);
+        return rc;
+}
+/*
+ * This function generates check information for a list of buffer_heads.
+ * bhs is the blocks to be checked.  bc is a pointer to the
+ * ocfs2_block_check structure describing the crc32 and the ecc.
+ * nr is the number of buffer_heads in bhs; nr == 0 is a no-op and a
+ * negative nr is a BUG.
+ *
+ * bc should be a pointer inside data, as the function will
+ * take care of zeroing it before calculating the check information.  If
+ * bc does not point inside data, the caller must make sure any inline
+ * ocfs2_block_check structures are zeroed.
+ *
+ * The data buffer must be in on-disk endian (little endian for ocfs2).
+ * bc will be filled with little-endian values and will be ready to go to
+ * disk.
+ */
+void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
+                                   struct ocfs2_block_check *bc)
+{
+        int i;
+        u32 crc, ecc;
+        BUG_ON(nr < 0);
+        if (!nr)
+                return;
+        memset(bc, 0, sizeof(struct ocfs2_block_check));
+        for (i = 0, crc = ~0, ecc = 0; i < nr; i++) {
+                crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
+                /*
+                 * The number of bits in a buffer is obviously b_size*8.
+                 * The offset of this buffer is b_size*i, so the bit offset
+                 * of this buffer is b_size*8*i.
+                 *
+                 * The running parity keeps its full u32 width here.
+                 * Truncating it to u16 on every pass would make the
+                 * overflow check below impossible to trip.
+                 */
+                ecc = ocfs2_hamming_encode(ecc, bhs[i]->b_data,
+                                           bhs[i]->b_size * 8,
+                                           bhs[i]->b_size * 8 * i);
+        }
+        /*
+         * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
+         * larger than 16 bits.
+         */
+        BUG_ON(ecc > USHORT_MAX);
+        bc->bc_crc32e = cpu_to_le32(crc);
+        bc->bc_ecc = cpu_to_le16((u16)ecc);
+}
+/*
+ * This function validates existing check information on a list of
+ * buffer_heads.  Like _compute_bhs, the function will take care of
+ * zeroing bc before calculating check codes.  If bc is not a pointer
+ * inside data, the caller must have zeroed any inline
+ * ocfs2_block_check structures.
+ *
+ * Again, the data passed in should be the on-disk endian.
+ *
+ * nr is the number of buffer_heads in bhs; nr == 0 returns 0 without
+ * touching bc and a negative nr is a BUG.  Returns 0 when the crc32 over
+ * all buffers matches the stored value, as passed in or after one ECC
+ * fix has been applied in place, and -EIO when it still does not match
+ * after the fix.  Whenever nr is non-zero the stored crc32 and ecc are
+ * written back into bc before returning.
+ */
+int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
+                                   struct ocfs2_block_check *bc)
+{
+        int i, rc = 0;
+        struct ocfs2_block_check check;
+        u32 crc, ecc, fix;
+        BUG_ON(nr < 0);
+        if (!nr)
+                return 0;
+        check.bc_crc32e = le32_to_cpu(bc->bc_crc32e);
+        check.bc_ecc = le16_to_cpu(bc->bc_ecc);
+        memset(bc, 0, sizeof(struct ocfs2_block_check));
+        /*
+         * Fast path - if the crc32 validates, we're good to go.
+         * The crc is chained across the buffers in array order.
+         */
+        for (i = 0, crc = ~0; i < nr; i++)
+                crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
+        if (crc == check.bc_crc32e)
+                goto out;
+        mlog(ML_ERROR,
+             "CRC32 failed: stored: %u, computed %u.  Applying ECC.\n",
+             (unsigned int)check.bc_crc32e, (unsigned int)crc);
+        /* Ok, try ECC fixups: first recompute the parity over all buffers */
+        for (i = 0, ecc = 0; i < nr; i++) {
+                /*
+                 * The number of bits in a buffer is obviously b_size*8.
+                 * The offset of this buffer is b_size*i, so the bit offset
+                 * of this buffer is b_size*8*i.
+                 */
+                ecc = (u16)ocfs2_hamming_encode(ecc, bhs[i]->b_data,
+                                                bhs[i]->b_size * 8,
+                                                bhs[i]->b_size * 8 * i);
+        }
+        fix = ecc ^ check.bc_ecc;
+        for (i = 0; i < nr; i++) {
+                /*
+                 * Try the fix against each buffer.  It will only affect
+                 * one of them, because ocfs2_hamming_fix() returns
+                 * without changing anything when the fix lies outside
+                 * the hunk it was handed.
+                 */
+                ocfs2_hamming_fix(bhs[i]->b_data, bhs[i]->b_size * 8,
+                                  bhs[i]->b_size * 8 * i, fix);
+        }
+        /* And check the crc32 again, now over the possibly-modified data */
+        for (i = 0, crc = ~0; i < nr; i++)
+                crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
+        if (crc == check.bc_crc32e)
+                goto out;
+        mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n",
+             (unsigned int)check.bc_crc32e, (unsigned int)crc);
+        rc = -EIO;
+out:
+        bc->bc_crc32e = cpu_to_le32(check.bc_crc32e);
+        bc->bc_ecc = cpu_to_le16(check.bc_ecc);
+        return rc;
+}
+/*
+ * These are the main API.  Each one consults the superblock's metaecc
+ * flag first and only then calls the underlying operation.
+ *
+ * They expect the buffer(s) to be in disk format.
+ */
+void ocfs2_compute_meta_ecc(struct super_block *sb, void *data,
+                            struct ocfs2_block_check *bc)
+{
+        /* Without metaecc there is nothing to compute */
+        if (!ocfs2_meta_ecc(OCFS2_SB(sb)))
+                return;
+        ocfs2_block_check_compute(data, sb->s_blocksize, bc);
+}
+int ocfs2_validate_meta_ecc(struct super_block *sb, void *data,
+                            struct ocfs2_block_check *bc)
+{
+        /* Without metaecc every block is reported as valid */
+        if (!ocfs2_meta_ecc(OCFS2_SB(sb)))
+                return 0;
+        return ocfs2_block_check_validate(data, sb->s_blocksize, bc);
+}
+void ocfs2_compute_meta_ecc_bhs(struct super_block *sb,
+                                struct buffer_head **bhs, int nr,
+                                struct ocfs2_block_check *bc)
+{
+        /* Without metaecc there is nothing to compute */
+        if (!ocfs2_meta_ecc(OCFS2_SB(sb)))
+                return;
+        ocfs2_block_check_compute_bhs(bhs, nr, bc);
+}
+int ocfs2_validate_meta_ecc_bhs(struct super_block *sb,
+                                struct buffer_head **bhs, int nr,
+                                struct ocfs2_block_check *bc)
+{
+        /* Without metaecc every buffer list is reported as valid */
+        if (!ocfs2_meta_ecc(OCFS2_SB(sb)))
+                return 0;
+        return ocfs2_block_check_validate_bhs(bhs, nr, bc);
+}
diff --git a/fs/ocfs2/blockcheck.h b/fs/ocfs2/blockcheck.h
new file mode 100644
index 000000000000..70ec3feda32f
--- /dev/null
+++ b/fs/ocfs2/blockcheck.h
@@ -0,0 +1,82 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * blockcheck.h
+ *
+ * Checksum and ECC codes for the OCFS2 userspace library.
+ *
+ * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License, version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#ifndef OCFS2_BLOCKCHECK_H
+#define OCFS2_BLOCKCHECK_H
+/* High level block API */
+void ocfs2_compute_meta_ecc(struct super_block *sb, void *data,
+                            struct ocfs2_block_check *bc);
+int ocfs2_validate_meta_ecc(struct super_block *sb, void *data,
+                            struct ocfs2_block_check *bc);
+void ocfs2_compute_meta_ecc_bhs(struct super_block *sb,
+                                struct buffer_head **bhs, int nr,
+                                struct ocfs2_block_check *bc);
+int ocfs2_validate_meta_ecc_bhs(struct super_block *sb,
+                                struct buffer_head **bhs, int nr,
+                                struct ocfs2_block_check *bc);
+/* Lower level API */
+void ocfs2_block_check_compute(void *data, size_t blocksize,
+                               struct ocfs2_block_check *bc);
+int ocfs2_block_check_validate(void *data, size_t blocksize,
+                               struct ocfs2_block_check *bc);
+void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
+                                   struct ocfs2_block_check *bc);
+int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
+                                   struct ocfs2_block_check *bc);
+/*
+ * Hamming code functions
+ */
+/*
+ * Encoding hamming code parity bits for a buffer.
+ *
+ * This is the low level encoder function.  It can be called across
+ * multiple hunks just like the crc32 code.  'd' is the number of bits
+ * _in_this_hunk_.  nr is the bit offset of this hunk.  So, if you had
+ * two 512B buffers, you would do it like so:
+ *
+ * parity = ocfs2_hamming_encode(0, buf1, 512 * 8, 0);
+ * parity = ocfs2_hamming_encode(parity, buf2, 512 * 8, 512 * 8);
+ *
+ * If you just have one buffer, use ocfs2_hamming_encode_block().
+ */
+u32 ocfs2_hamming_encode(u32 parity, void *data, unsigned int d,
+                         unsigned int nr);
+/*
+ * Fix a buffer with a bit error.  The 'fix' is the original parity
+ * xor'd with the parity calculated now.
+ *
+ * Like ocfs2_hamming_encode(), this can handle hunks.  nr is the bit
+ * offset of the current hunk.  If bit to be fixed is not part of the
+ * current hunk, this does nothing.
+ *
+ * If you only have one buffer, use ocfs2_hamming_fix_block().
+ */
+void ocfs2_hamming_fix(void *data, unsigned int d, unsigned int nr,
+                       unsigned int fix);
+/* Convenience wrappers for a single buffer of data */
+extern u32 ocfs2_hamming_encode_block(void *data, unsigned int blocksize);
+extern void ocfs2_hamming_fix_block(void *data, unsigned int blocksize,
+                                    unsigned int fix);
+#endif
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 3a178ec48d7c..15c8e6deee2e 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -39,6 +39,18 @@
 #include "buffer_head_io.h"
+/*
+ * Bits on bh->b_state used by ocfs2.
+ *
+ * These MUST be after the JBD2 bits.  Hence, we use BH_JBDPrivateStart.
+ */
+enum ocfs2_state_bits {
+        BH_NeedsValidate = BH_JBDPrivateStart,
+};
+/* Expand the magic b_state functions */
+BUFFER_FNS(NeedsValidate, needs_validate);
 int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
                      struct inode *inode)
 {
@@ -166,7 +178,9 @@ bail:
 }
 int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
-                      struct buffer_head *bhs[], int flags)
+                      struct buffer_head *bhs[], int flags,
+                      int (*validate)(struct super_block *sb,
+                                      struct buffer_head *bh))
 {
        int status = 0;
        int i, ignore_cache = 0;
@@ -298,6 +312,8 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
                        clear_buffer_uptodate(bh);
                        get_bh(bh); /* for end_buffer_read_sync() */
+                        if (validate)
+                                set_buffer_needs_validate(bh);
                        bh->b_end_io = end_buffer_read_sync;
                        submit_bh(READ, bh);
                        continue;
@@ -328,6 +344,20 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
                                bhs[i] = NULL;
                                continue;
                        }
+                        if (buffer_needs_validate(bh)) {
+                                /* We never set NeedsValidate if the
+                                 * buffer was held by the journal, so
+                                 * that better not have changed */
+                                BUG_ON(buffer_jbd(bh));
+                                clear_buffer_needs_validate(bh);
+                                status = validate(inode->i_sb, bh);
+                                if (status) {
+                                        put_bh(bh);
+                                        bhs[i] = NULL;
+                                        continue;
+                                }
+                        }
                }
                /* Always set the buffer in the cache, even if it was
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h
index 75e1dcb1ade7..c75d682dadd8 100644
--- a/fs/ocfs2/buffer_head_io.h
+++ b/fs/ocfs2/buffer_head_io.h
@@ -31,21 +31,24 @@
 void ocfs2_end_buffer_io_sync(struct buffer_head *bh,
                             int uptodate);
-static inline int ocfs2_read_block(struct inode        *inode,
-                                   u64                  off,
-                                   struct buffer_head **bh);
 int ocfs2_write_block(struct ocfs2_super          *osb,
                      struct buffer_head  *bh,
                      struct inode        *inode);
-int ocfs2_read_blocks(struct inode        *inode,
-                      u64                  block,
-                      int                  nr,
-                      struct buffer_head  *bhs[],
-                      int                  flags);
 int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
                           unsigned int nr, struct buffer_head *bhs[]);
+/*
+ * If not NULL, validate() will be called on a buffer that is freshly
+ * read from disk.  It will not be called if the buffer was in cache.
+ * Note that if validate() is being used for this buffer, it needs to
+ * be set even for a READAHEAD call, as it marks the buffer for later
+ * validation.
+ */
+int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
+                      struct buffer_head *bhs[], int flags,
+                      int (*validate)(struct super_block *sb,
+                                      struct buffer_head *bh));
 int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
                                struct buffer_head *bh);
@@ -53,7 +56,9 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
 #define OCFS2_BH_READAHEAD         8
 static inline int ocfs2_read_block(struct inode *inode, u64 off,
-                                   struct buffer_head **bh)
+                                   struct buffer_head **bh,
+                                   int (*validate)(struct super_block *sb,
+                                                   struct buffer_head *bh))
 {
        int status = 0;
@@ -63,7 +68,7 @@ static inline int ocfs2_read_block(struct inode *inode, u64 off,
                goto bail;
        }
-        status = ocfs2_read_blocks(inode, off, 1, bh, 0);
+        status = ocfs2_read_blocks(inode, off, 1, bh, 0, validate);
 bail:
        return status;
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 6ebaa58e2c03..04697ba7f73e 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -854,7 +854,7 @@ static int o2hb_thread(void *data)
        while (!kthread_should_stop() && !reg->hr_unclean_stop) {
                /* We track the time spent inside
-                 * o2hb_do_disk_heartbeat so that we avoid more then
+                 * o2hb_do_disk_heartbeat so that we avoid more than
                 * hr_timeout_ms between disk writes. On busy systems
                 * this should result in a heartbeat which is less
                 * likely to time itself out. */
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index d8a0cb92cef6..96df5416993e 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -110,6 +110,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
        define_mask(QUORUM),
        define_mask(EXPORT),
        define_mask(XATTR),
+        define_mask(QUOTA),
        define_mask(ERROR),
        define_mask(NOTICE),
        define_mask(KTHREAD),
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 57670c680471..7e72a81bc2d4 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -113,6 +113,7 @@
 #define ML_QUORUM       0x0000000008000000ULL /* net connection quorum */
 #define ML_EXPORT       0x0000000010000000ULL /* ocfs2 export operations */
 #define ML_XATTR        0x0000000020000000ULL /* ocfs2 extended attributes */
+#define ML_QUOTA        0x0000000040000000ULL /* ocfs2 quota operations */
 /* bits that are infrequently given and frequently matched in the high word */
 #define ML_ERROR        0x0000000100000000ULL /* sent to KERN_ERR */
 #define ML_NOTICE       0x0000000200000000ULL /* setn to KERN_NOTICE */
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index b1cc7c381e88..e9d7c2038c0f 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -38,6 +38,7 @@
 #include "dlmglue.h"
 #include "file.h"
 #include "inode.h"
+#include "super.h"
 static int ocfs2_dentry_revalidate(struct dentry *dentry,
@@ -294,6 +295,34 @@ out_attach:
        return ret;
 }
+static DEFINE_SPINLOCK(dentry_list_lock);
+/* We limit the number of dentry locks to drop in one go. We have
+ * this limit so that we don't starve other users of ocfs2_wq. */
+#define DL_INODE_DROP_COUNT 64
+/*
+ * Drop inode references from dentry locks.
+ *
+ * Work function for osb->dentry_lock_work.  It pops at most
+ * DL_INODE_DROP_COUNT entries off osb->dentry_lock_list.  For each entry
+ * the inode reference is dropped with iput() and the dentry lock is
+ * freed.  dentry_list_lock is held only while the list itself is
+ * touched; it is released around iput() and kfree().  If entries are
+ * still on the list afterwards, the work is queued on ocfs2_wq again.
+ */
+void ocfs2_drop_dl_inodes(struct work_struct *work)
+{
+        struct ocfs2_super *osb = container_of(work, struct ocfs2_super,
+                                               dentry_lock_work);
+        struct ocfs2_dentry_lock *dl;
+        int drop_count = DL_INODE_DROP_COUNT;
+        spin_lock(&dentry_list_lock);
+        while (osb->dentry_lock_list && drop_count--) {
+                dl = osb->dentry_lock_list;
+                osb->dentry_lock_list = dl->dl_next;
+                spin_unlock(&dentry_list_lock);
+                iput(dl->dl_inode);
+                kfree(dl);
+                spin_lock(&dentry_list_lock);
+        }
+        if (osb->dentry_lock_list)
+                queue_work(ocfs2_wq, &osb->dentry_lock_work);
+        spin_unlock(&dentry_list_lock);
+}
 /*
 * ocfs2_dentry_iput() and friends.
 *
@@ -318,16 +347,23 @@ out_attach:
 static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb,
                                   struct ocfs2_dentry_lock *dl)
 {
-        iput(dl->dl_inode);
        ocfs2_simple_drop_lockres(osb, &dl->dl_lockres);
        ocfs2_lock_res_free(&dl->dl_lockres);
-        kfree(dl);
+        /* We leave dropping of inode reference to ocfs2_wq as that can
+         * possibly lead to inode deletion which gets tricky */
+        spin_lock(&dentry_list_lock);
+        if (!osb->dentry_lock_list)
+                queue_work(ocfs2_wq, &osb->dentry_lock_work);
+        dl->dl_next = osb->dentry_lock_list;
+        osb->dentry_lock_list = dl;
+        spin_unlock(&dentry_list_lock);
 }
 void ocfs2_dentry_lock_put(struct ocfs2_super *osb,
                           struct ocfs2_dentry_lock *dl)
 {
-        int unlock = 0;
+        int unlock;
        BUG_ON(dl->dl_count == 0);
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h
index c091c34d9883..d06e16c06640 100644
--- a/fs/ocfs2/dcache.h
+++ b/fs/ocfs2/dcache.h
@@ -29,8 +29,13 @@
 extern struct dentry_operations ocfs2_dentry_ops;
 struct ocfs2_dentry_lock {
+        /* Use count of dentry lock */
        unsigned int            dl_count;
-        u64                     dl_parent_blkno;
+        union {
+                /* Linked list of dentry locks to release */
+                struct ocfs2_dentry_lock *dl_next;
+                u64                     dl_parent_blkno;
+        };
        /*
         * The ocfs2_dentry_lock keeps an inode reference until
@@ -47,6 +52,8 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode,
 void ocfs2_dentry_lock_put(struct ocfs2_super *osb,
                           struct ocfs2_dentry_lock *dl);
+void ocfs2_drop_dl_inodes(struct work_struct *work);
 struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno,
                                      int skip_unhashed);
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 026e6eb85187..f2c4098cf337 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -40,6 +40,7 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
+#include <linux/quotaops.h>
 #define MLOG_MASK_PREFIX ML_NAMEI
 #include <cluster/masklog.h>
@@ -47,6 +48,7 @@
 #include "ocfs2.h"
 #include "alloc.h"
+#include "blockcheck.h"
 #include "dir.h"
 #include "dlmglue.h"
 #include "extent_map.h"
@@ -82,47 +84,72 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
                               struct ocfs2_alloc_context *meta_ac,
                               struct buffer_head **new_bh);
-static struct buffer_head *ocfs2_bread(struct inode *inode,
+/*
-                                       int block, int *err, int reada)
+ * These are distinct checks because future versions of the file system will
+ * want to have a trailing dirent structure independent of indexing.
+ */
+static int ocfs2_dir_has_trailer(struct inode *dir)
 {
-        struct buffer_head *bh = NULL;
+        if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
-        int tmperr;
+                return 0;
-        u64 p_blkno;
-        int readflags = 0;
-        if (reada)
+        return ocfs2_meta_ecc(OCFS2_SB(dir->i_sb));
-                readflags |= OCFS2_BH_READAHEAD;
+}
-        if (((u64)block << inode->i_sb->s_blocksize_bits) >=
+static int ocfs2_supports_dir_trailer(struct ocfs2_super *osb)
-            i_size_read(inode)) {
+{
-                BUG_ON(!reada);
+        return ocfs2_meta_ecc(osb);
-                return NULL;
+}
-        }
-        down_read(&OCFS2_I(inode)->ip_alloc_sem);
+static inline unsigned int ocfs2_dir_trailer_blk_off(struct super_block *sb)
-        tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
+{
-                                             NULL);
+        return sb->s_blocksize - sizeof(struct ocfs2_dir_block_trailer);
-        up_read(&OCFS2_I(inode)->ip_alloc_sem);
+}
-        if (tmperr < 0) {
-                mlog_errno(tmperr);
-                goto fail;
-        }
-        tmperr = ocfs2_read_blocks(inode, p_blkno, 1, &bh, readflags);
+#define ocfs2_trailer_from_bh(_bh, _sb) ((struct ocfs2_dir_block_trailer *) ((_bh)->b_data + ocfs2_dir_trailer_blk_off((_sb))))
-        if (tmperr < 0)
-                goto fail;
-        tmperr = 0;
+/* XXX ocfs2_block_dqtrailer() is similar but not quite - can we make
+ * them more consistent? */
+struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize,
+                                                            void *data)
+{
+        char *p = data;
-        *err = 0;
+        p += blocksize - sizeof(struct ocfs2_dir_block_trailer);
-        return bh;
+        return (struct ocfs2_dir_block_trailer *)p;
+}
-fail:
+/*
-        brelse(bh);
+ * XXX: This is executed once on every dirent. We should consider optimizing
-        bh = NULL;
+ * it.
+ */
+static int ocfs2_skip_dir_trailer(struct inode *dir,
+                                  struct ocfs2_dir_entry *de,
+                                  unsigned long offset,
+                                  unsigned long blklen)
+{
+        unsigned long toff = blklen - sizeof(struct ocfs2_dir_block_trailer);
-        *err = -EIO;
+        if (!ocfs2_dir_has_trailer(dir))
-        return NULL;
+                return 0;
+        if (offset != toff)
+                return 0;
+        return 1;
+}
+static void ocfs2_init_dir_trailer(struct inode *inode,
+                                   struct buffer_head *bh)
+{
+        struct ocfs2_dir_block_trailer *trailer;
+        trailer = ocfs2_trailer_from_bh(bh, inode->i_sb);
+        strcpy(trailer->db_signature, OCFS2_DIR_TRAILER_SIGNATURE);
+        trailer->db_compat_rec_len =
+                        cpu_to_le16(sizeof(struct ocfs2_dir_block_trailer));
+        trailer->db_parent_dinode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
+        trailer->db_blkno = cpu_to_le64(bh->b_blocknr);
 }
 /*
@@ -231,7 +258,7 @@ static struct buffer_head *ocfs2_find_entry_id(const char *name,
        struct ocfs2_dinode *di;
        struct ocfs2_inline_data *data;
-        ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh);
+        ret = ocfs2_read_inode_block(dir, &di_bh);
        if (ret) {
                mlog_errno(ret);
                goto out;
@@ -250,6 +277,108 @@ out:
        return NULL;
 }
+/*
+ * Validate callback for a directory block.  Checks only the block's
+ * checksum/ecc, which lives in the trailer at the end of the block.
+ * Returns 0 on success or the error from ocfs2_validate_meta_ecc().
+ */
+static int ocfs2_validate_dir_block(struct super_block *sb,
+                                    struct buffer_head *bh)
+{
+        int rc;
+        struct ocfs2_dir_block_trailer *trailer =
+                ocfs2_trailer_from_bh(bh, sb);
+        /*
+         * We don't validate dirents here, that's handled
+         * in-place when the code walks them.
+         */
+        mlog(0, "Validating dirblock %llu\n",
+             (unsigned long long)bh->b_blocknr);
+        BUG_ON(!buffer_uptodate(bh));
+        /*
+         * If the ecc fails, we return the error but otherwise
+         * leave the filesystem running.  We know any error is
+         * local to this block.
+         *
+         * Note that we are safe to call this even if the directory
+         * doesn't have a trailer.  Filesystems without metaecc will do
+         * nothing, and filesystems with it will have one.
+         */
+        rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &trailer->db_check);
+        if (rc)
+                /* The number logged is a directory block, not a dinode */
+                mlog(ML_ERROR, "Checksum failed for dir block %llu\n",
+                     (unsigned long long)bh->b_blocknr);
+        return rc;
+}
+/*
+ * Read virtual block v_block of directory inode into *bh and, unless
+ * this is a readahead request, sanity-check the block's trailer
+ * (signature, db_blkno and db_parent_dinode) when the directory has one.
+ *
+ * *bh may be NULL on entry, in which case the buffer_head obtained by
+ * ocfs2_read_virt_blocks() is passed up on success.  If the trailer
+ * check fails on such a freshly obtained buffer_head, the reference is
+ * dropped here, because the caller never learns about it.
+ *
+ * This function forces all errors to -EIO for consistency with its
+ * predecessor, ocfs2_bread().  We haven't audited what returning the
+ * real error codes would do to callers.  We log the real codes with
+ * mlog_errno() before we squash them.
+ */
+static int ocfs2_read_dir_block(struct inode *inode, u64 v_block,
+                                struct buffer_head **bh, int flags)
+{
+        int rc = 0;
+        struct buffer_head *tmp = *bh;
+        struct ocfs2_dir_block_trailer *trailer;
+        rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, flags,
+                                    ocfs2_validate_dir_block);
+        if (rc) {
+                mlog_errno(rc);
+                goto out;
+        }
+        /*
+         * We check the trailer here rather than in
+         * ocfs2_validate_dir_block() because that function doesn't have
+         * the inode to test.
+         */
+        if (!(flags & OCFS2_BH_READAHEAD) &&
+            ocfs2_dir_has_trailer(inode)) {
+                trailer = ocfs2_trailer_from_bh(tmp, inode->i_sb);
+                if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) {
+                        rc = -EINVAL;
+                        ocfs2_error(inode->i_sb,
+                                    "Invalid dirblock #%llu: "
+                                    "signature = %.*s\n",
+                                    (unsigned long long)tmp->b_blocknr, 7,
+                                    trailer->db_signature);
+                        goto out_release;
+                }
+                if (le64_to_cpu(trailer->db_blkno) != tmp->b_blocknr) {
+                        rc = -EINVAL;
+                        ocfs2_error(inode->i_sb,
+                                    "Directory block #%llu has an invalid "
+                                    "db_blkno of %llu",
+                                    (unsigned long long)tmp->b_blocknr,
+                                    (unsigned long long)le64_to_cpu(trailer->db_blkno));
+                        goto out_release;
+                }
+                if (le64_to_cpu(trailer->db_parent_dinode) !=
+                    OCFS2_I(inode)->ip_blkno) {
+                        rc = -EINVAL;
+                        /* Report the field that actually failed the test */
+                        ocfs2_error(inode->i_sb,
+                                    "Directory block #%llu on dinode "
+                                    "#%llu has an invalid parent_dinode "
+                                    "of %llu",
+                                    (unsigned long long)tmp->b_blocknr,
+                                    (unsigned long long)OCFS2_I(inode)->ip_blkno,
+                                    (unsigned long long)le64_to_cpu(trailer->db_parent_dinode));
+                        goto out_release;
+                }
+        }
+        /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */
+        if (!*bh)
+                *bh = tmp;
+        return 0;
+out_release:
+        /*
+         * Only drop the reference this call obtained.  A buffer_head
+         * handed in by the caller stays the caller's to release.
+         */
+        if (!*bh)
+                brelse(tmp);
+out:
+        return rc ? -EIO : 0;
+}
 static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen,
                                               struct inode *dir,
                                               struct ocfs2_dir_entry **res_dir)
@@ -296,15 +425,17 @@ restart:
                                }
                                num++;
-                                bh = ocfs2_bread(dir, b++, &err, 1);
+                                bh = NULL;
+                                err = ocfs2_read_dir_block(dir, b++, &bh,
+                                                           OCFS2_BH_READAHEAD);
                                bh_use[ra_max] = bh;
                        }
                }
                if ((bh = bh_use[ra_ptr++]) == NULL)
                        goto next;
-                if (ocfs2_read_block(dir, block, &bh)) {
+                if (ocfs2_read_dir_block(dir, block, &bh, 0)) {
                        /* read error, skip block & hope for the best.
-                         * ocfs2_read_block() has released the bh. */
+                         * ocfs2_read_dir_block() has released the bh. */
                        ocfs2_error(dir->i_sb, "reading directory %llu, "
                                    "offset %lu\n",
                                    (unsigned long long)OCFS2_I(dir)->ip_blkno,
@@ -381,14 +512,18 @@ int ocfs2_update_entry(struct inode *dir, handle_t *handle,
                       struct inode *new_entry_inode)
 {
        int ret;
+        ocfs2_journal_access_func access = ocfs2_journal_access_db;
        /*
         * The same code works fine for both inline-data and extent
-         * based directories, so no need to split this up.
+         * based directories, so no need to split this up.  The only
+         * difference is the journal_access function.
         */
-        ret = ocfs2_journal_access(handle, dir, de_bh,
+        if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                access = ocfs2_journal_access_di;
+        ret = access(handle, dir, de_bh, OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
                goto out;
@@ -410,9 +545,13 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
 {
        struct ocfs2_dir_entry *de, *pde;
        int i, status = -ENOENT;
+        ocfs2_journal_access_func access = ocfs2_journal_access_db;
        mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh);
+        if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+                access = ocfs2_journal_access_di;
        i = 0;
        pde = NULL;
        de = (struct ocfs2_dir_entry *) first_de;
@@ -423,8 +562,8 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
                        goto bail;
                }
                if (de == de_del)  {
-                        status = ocfs2_journal_access(handle, dir, bh,
+                        status = access(handle, dir, bh,
-                                                      OCFS2_JOURNAL_ACCESS_WRITE);
+                                        OCFS2_JOURNAL_ACCESS_WRITE);
                        if (status < 0) {
                                status = -EIO;
                                mlog_errno(status);
@@ -458,7 +597,7 @@ static inline int ocfs2_delete_entry_id(handle_t *handle,
        struct ocfs2_dinode *di;
        struct ocfs2_inline_data *data;
-        ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh);
+        ret = ocfs2_read_inode_block(dir, &di_bh);
        if (ret) {
                mlog_errno(ret);
                goto out;
@@ -576,6 +715,16 @@ int __ocfs2_add_entry(handle_t *handle,
                        goto bail;
                }
+                /* We're guaranteed that we should have space, so we
+                 * can't possibly have hit the trailer...right? */
+                mlog_bug_on_msg(ocfs2_skip_dir_trailer(dir, de, offset, size),
+                                "Hit dir trailer trying to insert %.*s "
+                                "(namelen %d) into directory %llu.  "
+                                "offset is %lu, trailer offset is %d\n",
+                                namelen, name, namelen,
+                                (unsigned long long)parent_fe_bh->b_blocknr,
+                                offset, ocfs2_dir_trailer_blk_off(dir->i_sb));
                if (ocfs2_dirent_would_fit(de, rec_len)) {
                        dir->i_mtime = dir->i_ctime = CURRENT_TIME;
                        retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
@@ -584,8 +733,14 @@ int __ocfs2_add_entry(handle_t *handle,
                                goto bail;
                        }
-                        status = ocfs2_journal_access(handle, dir, insert_bh,
+                        if (insert_bh == parent_fe_bh)
-                                                      OCFS2_JOURNAL_ACCESS_WRITE);
+                                status = ocfs2_journal_access_di(handle, dir,
+                                                                 insert_bh,
+                                                                 OCFS2_JOURNAL_ACCESS_WRITE);
+                        else
+                                status = ocfs2_journal_access_db(handle, dir,
+                                                                 insert_bh,
+                                                                 OCFS2_JOURNAL_ACCESS_WRITE);
                        /* By now the buffer is marked for journaling */
                        offset += le16_to_cpu(de->rec_len);
                        if (le64_to_cpu(de->inode)) {
@@ -611,6 +766,7 @@ int __ocfs2_add_entry(handle_t *handle,
                        retval = 0;
                        goto bail;
                }
                offset += le16_to_cpu(de->rec_len);
                de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len));
        }
@@ -636,7 +792,7 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode,
        struct ocfs2_inline_data *data;
        struct ocfs2_dir_entry *de;
-        ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
+        ret = ocfs2_read_inode_block(inode, &di_bh);
        if (ret) {
                mlog(ML_ERROR, "Unable to read inode block for dir %llu\n",
                     (unsigned long long)OCFS2_I(inode)->ip_blkno);
@@ -724,7 +880,6 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
        int i, stored;
        struct buffer_head * bh, * tmp;
        struct ocfs2_dir_entry * de;
-        int err;
        struct super_block * sb = inode->i_sb;
        unsigned int ra_sectors = 16;
@@ -735,12 +890,8 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
        while (!error && !stored && *f_pos < i_size_read(inode)) {
                blk = (*f_pos) >> sb->s_blocksize_bits;
-                bh = ocfs2_bread(inode, blk, &err, 0);
+                if (ocfs2_read_dir_block(inode, blk, &bh, 0)) {
-                if (!bh) {
+                        /* Skip the corrupt dirblock and keep trying */
-                        mlog(ML_ERROR,
-                             "directory #%llu contains a hole at offset %lld\n",
-                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
-                             *f_pos);
                        *f_pos += sb->s_blocksize - offset;
                        continue;
                }
@@ -754,8 +905,10 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
                    || (((last_ra_blk - blk) << 9) <= (ra_sectors / 2))) {
                        for (i = ra_sectors >> (sb->s_blocksize_bits - 9);
                             i > 0; i--) {
-                                tmp = ocfs2_bread(inode, ++blk, &err, 1);
+                                tmp = NULL;
-                                brelse(tmp);
+                                if (!ocfs2_read_dir_block(inode, ++blk, &tmp,
+                                                          OCFS2_BH_READAHEAD))
+                                        brelse(tmp);
                        }
                        last_ra_blk = blk;
                        ra_sectors = 8;
@@ -828,6 +981,7 @@ revalidate:
                }
                offset = 0;
                brelse(bh);
+                bh = NULL;
        }
        stored = 0;
@@ -1050,9 +1204,15 @@ int ocfs2_empty_dir(struct inode *inode)
        return !priv.seen_other;
 }
-static void ocfs2_fill_initial_dirents(struct inode *inode,
+/*
-                                       struct inode *parent,
+ * Fills "." and ".." dirents in a new directory block. Returns dirent for
-                                       char *start, unsigned int size)
+ * "..", which might be used during creation of a directory with a trailing
+ * header. It is otherwise safe to ignore the return code.
+ */
+static struct ocfs2_dir_entry *ocfs2_fill_initial_dirents(struct inode *inode,
+                                                          struct inode *parent,
+                                                          char *start,
+                                                          unsigned int size)
 {
        struct ocfs2_dir_entry *de = (struct ocfs2_dir_entry *)start;
@@ -1069,6 +1229,8 @@ static void ocfs2_fill_initial_dirents(struct inode *inode,
        de->name_len = 2;
        strcpy(de->name, "..");
        ocfs2_set_de_type(de, S_IFDIR);
+        return de;
 }
 /*
@@ -1086,8 +1248,8 @@ static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb,
        struct ocfs2_inline_data *data = &di->id2.i_data;
        unsigned int size = le16_to_cpu(data->id_count);
-        ret = ocfs2_journal_access(handle, inode, di_bh,
+        ret = ocfs2_journal_access_di(handle, inode, di_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                                      OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
                goto out;
@@ -1121,10 +1283,15 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
                                 struct ocfs2_alloc_context *data_ac)
 {
        int status;
+        unsigned int size = osb->sb->s_blocksize;
        struct buffer_head *new_bh = NULL;
+        struct ocfs2_dir_entry *de;
        mlog_entry_void();
+        if (ocfs2_supports_dir_trailer(osb))
+                size = ocfs2_dir_trailer_blk_off(parent->i_sb);
        status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh,
                                     data_ac, NULL, &new_bh);
        if (status < 0) {
@@ -1134,16 +1301,17 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
        ocfs2_set_new_buffer_uptodate(inode, new_bh);
-        status = ocfs2_journal_access(handle, inode, new_bh,
+        status = ocfs2_journal_access_db(handle, inode, new_bh,
-                                      OCFS2_JOURNAL_ACCESS_CREATE);
+                                         OCFS2_JOURNAL_ACCESS_CREATE);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }
        memset(new_bh->b_data, 0, osb->sb->s_blocksize);
-        ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data,
+        de = ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data, size);
-                                   osb->sb->s_blocksize);
+        if (ocfs2_supports_dir_trailer(osb))
+                ocfs2_init_dir_trailer(inode, new_bh);
        status = ocfs2_journal_dirty(handle, new_bh);
        if (status < 0) {
@@ -1184,13 +1352,27 @@ int ocfs2_fill_new_dir(struct ocfs2_super *osb,
                                     data_ac);
 }
+/*
+ * Expand rec_len of the rightmost dirent in a directory block so that it
+ * contains the end of our valid space for dirents. We do this during
+ * expansion from an inline directory to one with extents. The first dir block
+ * in that case is taken from the inline data portion of the inode block.
+ *
+ * We add the dir trailer if this filesystem wants it.
+ */
 static void ocfs2_expand_last_dirent(char *start, unsigned int old_size,
-                                     unsigned int new_size)
+                                     struct super_block *sb)
 {
        struct ocfs2_dir_entry *de;
        struct ocfs2_dir_entry *prev_de;
        char *de_buf, *limit;
-        unsigned int bytes = new_size - old_size;
+        unsigned int new_size = sb->s_blocksize;
+        unsigned int bytes;
+        if (ocfs2_supports_dir_trailer(OCFS2_SB(sb)))
+                new_size = ocfs2_dir_trailer_blk_off(sb);
+        bytes = new_size - old_size;
        limit = start + old_size;
        de_buf = start;
@@ -1216,9 +1398,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
                                   unsigned int blocks_wanted,
                                   struct buffer_head **first_block_bh)
 {
-        int ret, credits = OCFS2_INLINE_TO_EXTENTS_CREDITS;
        u32 alloc, bit_off, len;
        struct super_block *sb = dir->i_sb;
+        int ret, credits = ocfs2_inline_to_extents_credits(sb);
        u64 blkno, bytes = blocks_wanted << sb->s_blocksize_bits;
        struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
        struct ocfs2_inode_info *oi = OCFS2_I(dir);
@@ -1227,6 +1409,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
        handle_t *handle;
        struct ocfs2_extent_tree et;
+        int did_quota = 0;
        ocfs2_init_dinode_extent_tree(&et, dir, di_bh);
@@ -1264,6 +1447,12 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
                goto out_sem;
        }
+        if (vfs_dq_alloc_space_nodirty(dir,
+                                ocfs2_clusters_to_bytes(osb->sb, alloc))) {
+                ret = -EDQUOT;
+                goto out_commit;
+        }
+        did_quota = 1;
        /*
         * Try to claim as many clusters as the bitmap can give though
         * if we only get one now, that's enough to continue. The rest
@@ -1290,8 +1479,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
        ocfs2_set_new_buffer_uptodate(dir, dirdata_bh);
-        ret = ocfs2_journal_access(handle, dir, dirdata_bh,
+        ret = ocfs2_journal_access_db(handle, dir, dirdata_bh,
-                                   OCFS2_JOURNAL_ACCESS_CREATE);
+                                      OCFS2_JOURNAL_ACCESS_CREATE);
        if (ret) {
                mlog_errno(ret);
                goto out_commit;
@@ -1300,8 +1489,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
        memcpy(dirdata_bh->b_data, di->id2.i_data.id_data, i_size_read(dir));
        memset(dirdata_bh->b_data + i_size_read(dir), 0,
               sb->s_blocksize - i_size_read(dir));
-        ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir),
+        ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), sb);
-                                 sb->s_blocksize);
+        if (ocfs2_supports_dir_trailer(osb))
+                ocfs2_init_dir_trailer(dir, dirdata_bh);
        ret = ocfs2_journal_dirty(handle, dirdata_bh);
        if (ret) {
@@ -1317,8 +1507,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
         * We let the later dirent insert modify c/mtime - to the user
         * the data hasn't changed.
         */
-        ret = ocfs2_journal_access(handle, dir, di_bh,
+        ret = ocfs2_journal_access_di(handle, dir, di_bh,
-                                   OCFS2_JOURNAL_ACCESS_CREATE);
+                                      OCFS2_JOURNAL_ACCESS_CREATE);
        if (ret) {
                mlog_errno(ret);
                goto out_commit;
@@ -1386,6 +1576,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
        dirdata_bh = NULL;
 out_commit:
+        if (ret < 0 && did_quota)
+                vfs_dq_free_space_nodirty(dir,
+                        ocfs2_clusters_to_bytes(osb->sb, 2));
        ocfs2_commit_trans(osb, handle);
 out_sem:
@@ -1410,7 +1603,7 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
                               struct buffer_head **new_bh)
 {
        int status;
-        int extend;
+        int extend, did_quota = 0;
        u64 p_blkno, v_blkno;
        spin_lock(&OCFS2_I(dir)->ip_lock);
@@ -1420,6 +1613,13 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
        if (extend) {
                u32 offset = OCFS2_I(dir)->ip_clusters;
+                if (vfs_dq_alloc_space_nodirty(dir,
+                                        ocfs2_clusters_to_bytes(sb, 1))) {
+                        status = -EDQUOT;
+                        goto bail;
+                }
+                did_quota = 1;
                status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset,
                                              1, 0, parent_fe_bh, handle,
                                              data_ac, meta_ac, NULL);
@@ -1445,6 +1645,8 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
        }
        status = 0;
 bail:
+        if (did_quota && status < 0)
+                vfs_dq_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1));
        mlog_exit(status);
        return status;
 }
@@ -1569,16 +1771,22 @@ do_extend:
        ocfs2_set_new_buffer_uptodate(dir, new_bh);
-        status = ocfs2_journal_access(handle, dir, new_bh,
+        status = ocfs2_journal_access_db(handle, dir, new_bh,
-                                      OCFS2_JOURNAL_ACCESS_CREATE);
+                                         OCFS2_JOURNAL_ACCESS_CREATE);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }
        memset(new_bh->b_data, 0, sb->s_blocksize);
        de = (struct ocfs2_dir_entry *) new_bh->b_data;
        de->inode = 0;
-        de->rec_len = cpu_to_le16(sb->s_blocksize);
+        if (ocfs2_dir_has_trailer(dir)) {
+                de->rec_len = cpu_to_le16(ocfs2_dir_trailer_blk_off(sb));
+                ocfs2_init_dir_trailer(dir, new_bh);
+        } else {
+                de->rec_len = cpu_to_le16(sb->s_blocksize);
+        }
        status = ocfs2_journal_dirty(handle, new_bh);
        if (status < 0) {
                mlog_errno(status);
@@ -1620,11 +1828,21 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
                                   unsigned int *blocks_wanted)
 {
        int ret;
+        struct super_block *sb = dir->i_sb;
        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
        struct ocfs2_dir_entry *de, *last_de = NULL;
        char *de_buf, *limit;
        unsigned long offset = 0;
-        unsigned int rec_len, new_rec_len;
+        unsigned int rec_len, new_rec_len, free_space = dir->i_sb->s_blocksize;
+        /*
+         * This calculates how many free bytes we'd have in block zero, should
+         * this function force expansion to an extent tree.
+         */
+        if (ocfs2_supports_dir_trailer(OCFS2_SB(sb)))
+                free_space = ocfs2_dir_trailer_blk_off(sb) - i_size_read(dir);
+        else
+                free_space = dir->i_sb->s_blocksize - i_size_read(dir);
        de_buf = di->id2.i_data.id_data;
        limit = de_buf + i_size_read(dir);
@@ -1641,6 +1859,11 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
                        ret = -EEXIST;
                        goto out;
                }
+                /*
+                 * No need to check for a trailing dirent record here as
+                 * they're not used for inline dirs.
+                 */
                if (ocfs2_dirent_would_fit(de, rec_len)) {
                        /* Ok, we found a spot. Return this bh and let
                         * the caller actually fill it in. */
@@ -1661,7 +1884,7 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
         * dirent can be found.
         */
        *blocks_wanted = 1;
-        new_rec_len = le16_to_cpu(last_de->rec_len) + (dir->i_sb->s_blocksize - i_size_read(dir));
+        new_rec_len = le16_to_cpu(last_de->rec_len) + free_space;
        if (new_rec_len < (rec_len + OCFS2_DIR_REC_LEN(last_de->name_len)))
                *blocks_wanted = 2;
@@ -1679,9 +1902,10 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
        struct ocfs2_dir_entry *de;
        struct super_block *sb = dir->i_sb;
        int status;
+        int blocksize = dir->i_sb->s_blocksize;
-        bh = ocfs2_bread(dir, 0, &status, 0);
+        status = ocfs2_read_dir_block(dir, 0, &bh, 0);
-        if (!bh) {
+        if (status) {
                mlog_errno(status);
                goto bail;
        }
@@ -1702,11 +1926,10 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
                                status = -ENOSPC;
                                goto bail;
                        }
-                        bh = ocfs2_bread(dir,
+                        status = ocfs2_read_dir_block(dir,
-                                         offset >> sb->s_blocksize_bits,
+                                             offset >> sb->s_blocksize_bits,
-                                         &status,
+                                             &bh, 0);
-                                         0);
+                        if (status) {
-                        if (!bh) {
                                mlog_errno(status);
                                goto bail;
                        }
@@ -1721,6 +1944,11 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
                        status = -EEXIST;
                        goto bail;
                }
+                if (ocfs2_skip_dir_trailer(dir, de, offset % blocksize,
+                                           blocksize))
+                        goto next;
                if (ocfs2_dirent_would_fit(de, rec_len)) {
                        /* Ok, we found a spot. Return this bh and let
                         * the caller actually fill it in. */
@@ -1729,6 +1957,7 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
                        status = 0;
                        goto bail;
                }
+next:
                offset += le16_to_cpu(de->rec_len);
                de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len));
        }
diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h
index ce48b9080d87..c511e2e18e9f 100644
--- a/fs/ocfs2/dir.h
+++ b/fs/ocfs2/dir.h
@@ -83,4 +83,6 @@ int ocfs2_fill_new_dir(struct ocfs2_super *osb,
                       struct buffer_head *fe_bh,
                       struct ocfs2_alloc_context *data_ac);
+struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize,
+                                                            void *data);
 #endif /* OCFS2_DIR_H */
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 644bee55d8ba..d07ddbe4b283 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -275,6 +275,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
        struct list_head *iter, *head=NULL;
        u64 cookie;
        u32 flags;
+        u8 node;
        if (!dlm_grab(dlm)) {
                dlm_error(DLM_REJECTED);
@@ -286,18 +287,21 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
        name = past->name;
        locklen = past->namelen;
-        cookie = be64_to_cpu(past->cookie);
+        cookie = past->cookie;
        flags = be32_to_cpu(past->flags);
+        node = past->node_idx;
        if (locklen > DLM_LOCKID_NAME_MAX) {
                ret = DLM_IVBUFLEN;
-                mlog(ML_ERROR, "Invalid name length in proxy ast handler!\n");
+                mlog(ML_ERROR, "Invalid name length (%d) in proxy ast "
+                     "handler!\n", locklen);
                goto leave;
        }
        if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) ==
             (LKM_PUT_LVB|LKM_GET_LVB)) {
-                mlog(ML_ERROR, "both PUT and GET lvb specified\n");
+                mlog(ML_ERROR, "Both PUT and GET lvb specified, (0x%x)\n",
+                     flags);
                ret = DLM_BADARGS;
                goto leave;
        }
@@ -310,22 +314,21 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
        if (past->type != DLM_AST &&
            past->type != DLM_BAST) {
                mlog(ML_ERROR, "Unknown ast type! %d, cookie=%u:%llu"
-                     "name=%.*s\n", past->type, 
+                     "name=%.*s, node=%u\n", past->type,
-                     dlm_get_lock_cookie_node(cookie),
+                     dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
-                     dlm_get_lock_cookie_seq(cookie),
+                     dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
-                     locklen, name);
+                     locklen, name, node);
                ret = DLM_IVLOCKID;
                goto leave;
        }
        res = dlm_lookup_lockres(dlm, name, locklen);
        if (!res) {
-                mlog(0, "got %sast for unknown lockres! "
+                mlog(0, "Got %sast for unknown lockres! cookie=%u:%llu, "
-                     "cookie=%u:%llu, name=%.*s, namelen=%u\n",
+                     "name=%.*s, node=%u\n", (past->type == DLM_AST ? "" : "b"),
-                     past->type == DLM_AST ? "" : "b",
+                     dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
-                     dlm_get_lock_cookie_node(cookie),
+                     dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
-                     dlm_get_lock_cookie_seq(cookie),
+                     locklen, name, node);
-                     locklen, name, locklen);
                ret = DLM_IVLOCKID;
                goto leave;
        }
@@ -337,12 +340,12 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
        spin_lock(&res->spinlock);
        if (res->state & DLM_LOCK_RES_RECOVERING) {
-                mlog(0, "responding with DLM_RECOVERING!\n");
+                mlog(0, "Responding with DLM_RECOVERING!\n");
                ret = DLM_RECOVERING;
                goto unlock_out;
        }
        if (res->state & DLM_LOCK_RES_MIGRATING) {
-                mlog(0, "responding with DLM_MIGRATING!\n");
+                mlog(0, "Responding with DLM_MIGRATING!\n");
                ret = DLM_MIGRATING;
                goto unlock_out;
        }
@@ -351,7 +354,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
        lock = NULL;
        list_for_each(iter, head) {
                lock = list_entry (iter, struct dlm_lock, list);
-                if (be64_to_cpu(lock->ml.cookie) == cookie)
+                if (lock->ml.cookie == cookie)
                        goto do_ast;
        }
@@ -363,15 +366,15 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
        list_for_each(iter, head) {
                lock = list_entry (iter, struct dlm_lock, list);
-                if (be64_to_cpu(lock->ml.cookie) == cookie)
+                if (lock->ml.cookie == cookie)
                        goto do_ast;
        }
-        mlog(0, "got %sast for unknown lock!  cookie=%u:%llu, "
+        mlog(0, "Got %sast for unknown lock! cookie=%u:%llu, name=%.*s, "
-             "name=%.*s, namelen=%u\n", past->type == DLM_AST ? "" : "b", 
+             "node=%u\n", past->type == DLM_AST ? "" : "b",
-             dlm_get_lock_cookie_node(cookie),
+             dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
-             dlm_get_lock_cookie_seq(cookie),
+             dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
-             locklen, name, locklen);
+             locklen, name, node);
        ret = DLM_NORMAL;
 unlock_out:
@@ -383,8 +386,8 @@ do_ast:
        if (past->type == DLM_AST) {
                /* do not alter lock refcount.  switching lists. */
                list_move_tail(&lock->list, &res->granted);
-                mlog(0, "ast: adding to granted list... type=%d, "
+                mlog(0, "ast: Adding to granted list... type=%d, "
-                          "convert_type=%d\n", lock->ml.type, lock->ml.convert_type);
+                     "convert_type=%d\n", lock->ml.type, lock->ml.convert_type);
                if (lock->ml.convert_type != LKM_IVMODE) {
                        lock->ml.type = lock->ml.convert_type;
                        lock->ml.convert_type = LKM_IVMODE;
@@ -408,7 +411,6 @@ do_ast:
                dlm_do_local_bast(dlm, res, lock, past->blocked_type);
 leave:
        if (res)
                dlm_lockres_put(res);
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index d5a86fb81a49..bb53714813ab 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -140,6 +140,7 @@ struct dlm_ctxt
        unsigned int purge_count;
        spinlock_t spinlock;
        spinlock_t ast_lock;
+        spinlock_t track_lock;
        char *name;
        u8 node_num;
        u32 key;
@@ -316,6 +317,8 @@ struct dlm_lock_resource
         * put on a list for the dlm thread to run. */
        unsigned long    last_used;
+        struct dlm_ctxt *dlm;
        unsigned migration_pending:1;
        atomic_t asts_reserved;
        spinlock_t spinlock;
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 1b81dcba175d..b32f60a5acfb 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -630,43 +630,38 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos)
 {
        struct debug_lockres *dl = m->private;
        struct dlm_ctxt *dlm = dl->dl_ctxt;
+        struct dlm_lock_resource *oldres = dl->dl_res;
        struct dlm_lock_resource *res = NULL;
+        struct list_head *track_list;
-        spin_lock(&dlm->spinlock);
+        spin_lock(&dlm->track_lock);
+        if (oldres)
+                track_list = &oldres->tracking;
+        else
+                track_list = &dlm->tracking_list;
-        if (dl->dl_res) {
+        list_for_each_entry(res, track_list, tracking) {
-                list_for_each_entry(res, &dl->dl_res->tracking, tracking) {
+                if (&res->tracking == &dlm->tracking_list)
-                        if (dl->dl_res) {
+                        res = NULL;
-                                dlm_lockres_put(dl->dl_res);
+                else
-                                dl->dl_res = NULL;
-                        }
-                        if (&res->tracking == &dlm->tracking_list) {
-                                mlog(0, "End of list found, %p\n", res);
-                                dl = NULL;
-                                break;
-                        }
                        dlm_lockres_get(res);
-                        dl->dl_res = res;
+                break;
-                        break;
-                }
-        } else {
-                if (!list_empty(&dlm->tracking_list)) {
-                        list_for_each_entry(res, &dlm->tracking_list, tracking)
-                                break;
-                        dlm_lockres_get(res);
-                        dl->dl_res = res;
-                } else
-                        dl = NULL;
        }
+        spin_unlock(&dlm->track_lock);
-        if (dl) {
+        if (oldres)
-                spin_lock(&dl->dl_res->spinlock);
+                dlm_lockres_put(oldres);
-                dump_lockres(dl->dl_res, dl->dl_buf, dl->dl_len - 1);
-                spin_unlock(&dl->dl_res->spinlock);
-        }
-        spin_unlock(&dlm->spinlock);
+        dl->dl_res = res;
+        if (res) {
+                spin_lock(&res->spinlock);
+                dump_lockres(res, dl->dl_buf, dl->dl_len - 1);
+                spin_unlock(&res->spinlock);
+        } else
+                dl = NULL;
+        /* passed to seq_show */
        return dl;
 }
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 63f8125824e8..d8d578f45613 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1550,6 +1550,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
        spin_lock_init(&dlm->spinlock);
        spin_lock_init(&dlm->master_lock);
        spin_lock_init(&dlm->ast_lock);
+        spin_lock_init(&dlm->track_lock);
        INIT_LIST_HEAD(&dlm->list);
        INIT_LIST_HEAD(&dlm->dirty_list);
        INIT_LIST_HEAD(&dlm->reco.resources);
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index 6f7a77d54020..1c9efb406a96 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -341,7 +341,6 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb)
                inode->i_mode = mode;
                inode->i_uid = current_fsuid();
                inode->i_gid = current_fsgid();
-                inode->i_blocks = 0;
                inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
                inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
                inc_nlink(inode);
@@ -367,7 +366,6 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
        inode->i_mode = mode;
        inode->i_uid = current_fsuid();
        inode->i_gid = current_fsgid();
-        inode->i_blocks = 0;
        inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
        inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 44f87caf3683..54e182a27caf 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -505,8 +505,10 @@ void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
 static void dlm_lockres_release(struct kref *kref)
 {
        struct dlm_lock_resource *res;
+        struct dlm_ctxt *dlm;
        res = container_of(kref, struct dlm_lock_resource, refs);
+        dlm = res->dlm;
        /* This should not happen -- all lockres' have a name
         * associated with them at init time. */
@@ -515,6 +517,7 @@ static void dlm_lockres_release(struct kref *kref)
        mlog(0, "destroying lockres %.*s\n", res->lockname.len,
             res->lockname.name);
+        spin_lock(&dlm->track_lock);
        if (!list_empty(&res->tracking))
                list_del_init(&res->tracking);
        else {
@@ -522,6 +525,9 @@ static void dlm_lockres_release(struct kref *kref)
                     res->lockname.len, res->lockname.name);
                dlm_print_one_lock_resource(res);
        }
+        spin_unlock(&dlm->track_lock);
+        dlm_put(dlm);
        if (!hlist_unhashed(&res->hash_node) ||
            !list_empty(&res->granted) ||
@@ -595,6 +601,10 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
        res->migration_pending = 0;
        res->inflight_locks = 0;
+        /* put in dlm_lockres_release */
+        dlm_grab(dlm);
+        res->dlm = dlm;
        kref_init(&res->refs);
        /* just for consistency */
@@ -722,14 +732,21 @@ lookup:
        if (tmpres) {
                int dropping_ref = 0;
+                spin_unlock(&dlm->spinlock);
                spin_lock(&tmpres->spinlock);
+                /* We wait for the other thread that is mastering the resource */
+                if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
+                        __dlm_wait_on_lockres(tmpres);
+                        BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN);
+                }
                if (tmpres->owner == dlm->node_num) {
                        BUG_ON(tmpres->state & DLM_LOCK_RES_DROPPING_REF);
                        dlm_lockres_grab_inflight_ref(dlm, tmpres);
                } else if (tmpres->state & DLM_LOCK_RES_DROPPING_REF)
                        dropping_ref = 1;
                spin_unlock(&tmpres->spinlock);
-                spin_unlock(&dlm->spinlock);
                /* wait until done messaging the master, drop our ref to allow
                 * the lockres to be purged, start over. */
@@ -2949,7 +2966,7 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
                                  struct dlm_node_iter *iter)
 {
        struct dlm_migrate_request migrate;
-        int ret, status = 0;
+        int ret, skip, status = 0;
        int nodenum;
        memset(&migrate, 0, sizeof(migrate));
@@ -2966,12 +2983,27 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
                    nodenum == new_master)
                        continue;
+                /* We could race with a node exiting the domain. If it has exited, skip it. */
+                spin_lock(&dlm->spinlock);
+                skip = (!test_bit(nodenum, dlm->domain_map));
+                spin_unlock(&dlm->spinlock);
+                if (skip) {
+                        clear_bit(nodenum, iter->node_map);
+                        continue;
+                }
                ret = o2net_send_message(DLM_MIGRATE_REQUEST_MSG, dlm->key,
                                         &migrate, sizeof(migrate), nodenum,
                                         &status);
-                if (ret < 0)
+                if (ret < 0) {
-                        mlog_errno(ret);
+                        mlog(0, "migrate_request returned %d!\n", ret);
-                else if (status < 0) {
+                        if (!dlm_is_host_down(ret)) {
+                                mlog(ML_ERROR, "unhandled error=%d!\n", ret);
+                                BUG();
+                        }
+                        clear_bit(nodenum, iter->node_map);
+                        ret = 0;
+                } else if (status < 0) {
                        mlog(0, "migrate request (node %u) returned %d!\n",
                             nodenum, status);
                        ret = status;
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 4060bb328bc8..d1295203029f 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -181,7 +181,8 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
                spin_lock(&res->spinlock);
                /* This ensures that clear refmap is sent after the set */
-                __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
+                __dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_SETREF_INPROG |
+                                                  DLM_LOCK_RES_MIGRATING));
                spin_unlock(&res->spinlock);
                /* clear our bit from the master's refmap, ignore errors */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 6e6cc0a2e5f7..206a2370876a 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -32,6 +32,7 @@
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
 #include <linux/time.h>
+#include <linux/quotaops.h>
 #define MLOG_MASK_PREFIX ML_DLM_GLUE
 #include <cluster/masklog.h>
@@ -51,6 +52,7 @@
 #include "slot_map.h"
 #include "super.h"
 #include "uptodate.h"
+#include "quota.h"
 #include "buffer_head_io.h"
@@ -68,6 +70,7 @@ struct ocfs2_mask_waiter {
 static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres);
 static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres);
 static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres);
+static struct ocfs2_super *ocfs2_get_qinfo_osb(struct ocfs2_lock_res *lockres);
 /*
 * Return value from ->downconvert_worker functions.
@@ -102,6 +105,7 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
 static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb,
                                     struct ocfs2_lock_res *lockres);
+static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres);
 #define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres)
@@ -111,8 +115,7 @@ static void ocfs2_dump_meta_lvb_info(u64 level,
                                     unsigned int line,
                                     struct ocfs2_lock_res *lockres)
 {
-        struct ocfs2_meta_lvb *lvb =
+        struct ocfs2_meta_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
-                (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
        mlog(level, "LVB information for %s (called from %s:%u):\n",
             lockres->l_name, function, line);
@@ -258,6 +261,12 @@ static struct ocfs2_lock_res_ops ocfs2_flock_lops = {
        .flags          = 0,
 };
+/*
+ * Lock operations for quota info lock resources (the ops table passed by
+ * ocfs2_qinfo_lock_res_init() together with OCFS2_LOCK_TYPE_QINFO).  The
+ * lock carries an LVB and is flagged as requiring a refresh.
+ */
+static struct ocfs2_lock_res_ops ocfs2_qinfo_lops = {
+        .set_lvb        = ocfs2_set_qinfo_lvb,
+        .get_osb        = ocfs2_get_qinfo_osb,
+        .flags          = LOCK_TYPE_REQUIRES_REFRESH | LOCK_TYPE_USES_LVB,
+};
 static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
 {
        return lockres->l_type == OCFS2_LOCK_TYPE_META ||
@@ -279,6 +288,13 @@ static inline struct ocfs2_dentry_lock *ocfs2_lock_res_dl(struct ocfs2_lock_res
        return (struct ocfs2_dentry_lock *)lockres->l_priv;
 }
+/*
+ * Return the quota info kept in l_priv of a quota info lock resource.
+ * BUGs if the lock resource is not of type OCFS2_LOCK_TYPE_QINFO.
+ */
+static inline struct ocfs2_mem_dqinfo *ocfs2_lock_res_qinfo(struct ocfs2_lock_res *lockres)
+{
+        BUG_ON(lockres->l_type != OCFS2_LOCK_TYPE_QINFO);
+        return (struct ocfs2_mem_dqinfo *)lockres->l_priv;
+}
 static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *lockres)
 {
        if (lockres->l_ops->get_osb)
@@ -507,6 +523,13 @@ static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres)
        return OCFS2_SB(inode->i_sb);
 }
+/*
+ * ->get_osb hook of ocfs2_qinfo_lops: find the ocfs2_super of a quota info
+ * lock resource through the super block recorded in its quota info
+ * (l_priv is read as a struct ocfs2_mem_dqinfo, without a type check).
+ */
+static struct ocfs2_super *ocfs2_get_qinfo_osb(struct ocfs2_lock_res *lockres)
+{
+        struct ocfs2_mem_dqinfo *info = lockres->l_priv;
+        return OCFS2_SB(info->dqi_gi.dqi_sb);
+}
 static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres)
 {
        struct ocfs2_file_private *fp = lockres->l_priv;
@@ -609,6 +632,17 @@ void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
        lockres->l_flags |= OCFS2_LOCK_NOCACHE;
 }
+/*
+ * Set up @lockres as the quota info lock for @info.
+ *
+ * The lock name is built from OCFS2_LOCK_TYPE_QINFO and the quota type
+ * (info->dqi_gi.dqi_type); the lock uses ocfs2_qinfo_lops.  @info is handed
+ * to ocfs2_lock_res_init_common() as the last argument -- presumably it ends
+ * up in l_priv, which the qinfo helpers read back (TODO confirm).
+ */
+void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres,
+                               struct ocfs2_mem_dqinfo *info)
+{
+        ocfs2_lock_res_init_once(lockres);
+        ocfs2_build_lock_name(OCFS2_LOCK_TYPE_QINFO, info->dqi_gi.dqi_type,
+                              0, lockres->l_name);
+        ocfs2_lock_res_init_common(OCFS2_SB(info->dqi_gi.dqi_sb), lockres,
+                                   OCFS2_LOCK_TYPE_QINFO, &ocfs2_qinfo_lops,
+                                   info);
+}
 void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
 {
        mlog_entry_void();
@@ -1290,7 +1324,7 @@ again:
                        goto out;
                }
-                mlog(0, "lock %s, successfull return from ocfs2_dlm_lock\n",
+                mlog(0, "lock %s, successful return from ocfs2_dlm_lock\n",
                     lockres->l_name);
                /* At this point we've gone inside the dlm and need to
@@ -1829,7 +1863,7 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
        mlog_entry_void();
-        lvb = (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
+        lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
        /*
         * Invalidate the LVB of a deleted inode - this way other
@@ -1881,7 +1915,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
        mlog_meta_lvb(0, lockres);
-        lvb = (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
+        lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
        /* We're safe here without the lockres lock... */
        spin_lock(&oi->ip_lock);
@@ -1916,8 +1950,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
 static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode,
                                              struct ocfs2_lock_res *lockres)
 {
-        struct ocfs2_meta_lvb *lvb =
+        struct ocfs2_meta_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
-                (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
        if (lvb->lvb_version == OCFS2_LVB_VERSION
            && be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation)
@@ -2024,7 +2057,7 @@ static int ocfs2_inode_lock_update(struct inode *inode,
        } else {
                /* Boo, we have to go to disk. */
                /* read bh, cast, ocfs2_refresh_inode */
-                status = ocfs2_read_block(inode, oi->ip_blkno, bh);
+                status = ocfs2_read_inode_block(inode, bh);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail_refresh;
@@ -2032,18 +2065,14 @@ static int ocfs2_inode_lock_update(struct inode *inode,
                fe = (struct ocfs2_dinode *) (*bh)->b_data;
                /* This is a good chance to make sure we're not
-                 * locking an invalid object.
+                 * locking an invalid object.  ocfs2_read_inode_block()
+                 * already checked that the inode block is sane.
                 *
                 * We bug on a stale inode here because we checked
                 * above whether it was wiped from disk. The wiping
                 * node provides a guarantee that we receive that
                 * message and can mark the inode before dropping any
                 * locks associated with it. */
-                if (!OCFS2_IS_VALID_DINODE(fe)) {
-                        OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
-                        status = -EIO;
-                        goto bail_refresh;
-                }
                mlog_bug_on_msg(inode->i_generation !=
                                le32_to_cpu(fe->i_generation),
                                "Invalid dinode %llu disk generation: %u "
@@ -2085,7 +2114,7 @@ static int ocfs2_assign_bh(struct inode *inode,
                return 0;
        }
-        status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, ret_bh);
+        status = ocfs2_read_inode_block(inode, ret_bh);
        if (status < 0)
                mlog_errno(status);
@@ -2831,6 +2860,10 @@ static void ocfs2_unlock_ast(void *opaque, int error)
        case OCFS2_UNLOCK_CANCEL_CONVERT:
                mlog(0, "Cancel convert success for %s\n", lockres->l_name);
                lockres->l_action = OCFS2_AST_INVALID;
+                /* Downconvert thread may have requeued this lock, we
+                 * need to wake it. */
+                if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
+                        ocfs2_wake_downconvert_thread(ocfs2_get_lockres_osb(lockres));
                break;
        case OCFS2_UNLOCK_DROP_LOCK:
                lockres->l_level = DLM_LOCK_IV;
@@ -2922,7 +2955,7 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb,
                ocfs2_dlm_dump_lksb(&lockres->l_lksb);
                BUG();
        }
-        mlog(0, "lock %s, successfull return from ocfs2_dlm_unlock\n",
+        mlog(0, "lock %s, successful return from ocfs2_dlm_unlock\n",
             lockres->l_name);
        ocfs2_wait_on_busy_lock(lockres);
@@ -3449,6 +3482,117 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
        return UNBLOCK_CONTINUE_POST;
 }
+/*
+ * ->set_lvb hook of ocfs2_qinfo_lops: copy the current in-memory quota info
+ * into the lock value block of @lockres.
+ *
+ * The LVB gets the version OCFS2_QINFO_LVB_VERSION, the grace times from the
+ * generic mem_dqinfo, and syncms plus the block/free counters from the ocfs2
+ * quota info.  All 32-bit fields are stored big-endian.
+ */
+static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres)
+{
+        struct ocfs2_qinfo_lvb *lvb;
+        struct ocfs2_mem_dqinfo *oinfo = ocfs2_lock_res_qinfo(lockres);
+        struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb,
+                                            oinfo->dqi_gi.dqi_type);
+        mlog_entry_void();
+        lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+        lvb->lvb_version = OCFS2_QINFO_LVB_VERSION;
+        lvb->lvb_bgrace = cpu_to_be32(info->dqi_bgrace);
+        lvb->lvb_igrace = cpu_to_be32(info->dqi_igrace);
+        lvb->lvb_syncms = cpu_to_be32(oinfo->dqi_syncms);
+        lvb->lvb_blocks = cpu_to_be32(oinfo->dqi_gi.dqi_blocks);
+        lvb->lvb_free_blk = cpu_to_be32(oinfo->dqi_gi.dqi_free_blk);
+        lvb->lvb_free_entry = cpu_to_be32(oinfo->dqi_gi.dqi_free_entry);
+        mlog_exit_void();
+}
+/*
+ * Release the cluster lock on quota info @oinfo.
+ *
+ * @ex selects the level to drop: DLM_LOCK_EX when non-zero, DLM_LOCK_PR
+ * otherwise (presumably it should match the value given to
+ * ocfs2_qinfo_lock() -- confirm against callers).  Nothing is unlocked on a
+ * hard read-only or locally mounted filesystem, the same cases in which
+ * ocfs2_qinfo_lock() takes no cluster lock.
+ */
+void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex)
+{
+        struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock;
+        struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb);
+        int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
+        mlog_entry_void();
+        if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb))
+                ocfs2_cluster_unlock(osb, lockres, level);
+        mlog_exit_void();
+}
+/*
+ * Refresh the in-memory quota info of @oinfo.
+ *
+ * If the LVB of the quota info lock carries OCFS2_QINFO_LVB_VERSION, the
+ * values are taken from the LVB.  Otherwise block 0 of the global quota
+ * file inode is read and the values are taken from the on-disk info found
+ * at OCFS2_GLOBAL_INFO_OFF in that block.
+ *
+ * Returns 0 on success or the error from ocfs2_read_quota_block().
+ */
+static int ocfs2_refresh_qinfo(struct ocfs2_mem_dqinfo *oinfo)
+{
+        struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb,
+                                            oinfo->dqi_gi.dqi_type);
+        struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock;
+        struct ocfs2_qinfo_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+        struct buffer_head *bh = NULL;
+        struct ocfs2_global_disk_dqinfo *gdinfo;
+        int status = 0;
+        /* LVB fields are big-endian, see ocfs2_set_qinfo_lvb() */
+        if (lvb->lvb_version == OCFS2_QINFO_LVB_VERSION) {
+                info->dqi_bgrace = be32_to_cpu(lvb->lvb_bgrace);
+                info->dqi_igrace = be32_to_cpu(lvb->lvb_igrace);
+                oinfo->dqi_syncms = be32_to_cpu(lvb->lvb_syncms);
+                oinfo->dqi_gi.dqi_blocks = be32_to_cpu(lvb->lvb_blocks);
+                oinfo->dqi_gi.dqi_free_blk = be32_to_cpu(lvb->lvb_free_blk);
+                oinfo->dqi_gi.dqi_free_entry =
+                                        be32_to_cpu(lvb->lvb_free_entry);
+        } else {
+                /* LVB version does not match: fall back to the quota file */
+                status = ocfs2_read_quota_block(oinfo->dqi_gqinode, 0, &bh);
+                if (status) {
+                        mlog_errno(status);
+                        goto bail;
+                }
+                /* On-disk fields are little-endian */
+                gdinfo = (struct ocfs2_global_disk_dqinfo *)
+                                        (bh->b_data + OCFS2_GLOBAL_INFO_OFF);
+                info->dqi_bgrace = le32_to_cpu(gdinfo->dqi_bgrace);
+                info->dqi_igrace = le32_to_cpu(gdinfo->dqi_igrace);
+                oinfo->dqi_syncms = le32_to_cpu(gdinfo->dqi_syncms);
+                oinfo->dqi_gi.dqi_blocks = le32_to_cpu(gdinfo->dqi_blocks);
+                oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(gdinfo->dqi_free_blk);
+                oinfo->dqi_gi.dqi_free_entry =
+                                        le32_to_cpu(gdinfo->dqi_free_entry);
+                brelse(bh);
+                ocfs2_track_lock_refresh(lockres);
+        }
+bail:
+        return status;
+}
+/* Lock quota info, this function expects at least shared lock on the quota file
+ * so that we can safely refresh quota info from disk.
+ *
+ * @ex selects the lock level: DLM_LOCK_EX when non-zero, DLM_LOCK_PR otherwise.
+ *
+ * Returns 0 on success (also when no cluster lock is taken because the
+ * filesystem is hard read-only or mounted locally), -EROFS for an exclusive
+ * request on a hard read-only filesystem, or the error from
+ * ocfs2_cluster_lock() / ocfs2_refresh_qinfo().  If the refresh fails the
+ * lock is dropped again before returning. */
+int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex)
+{
+        struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock;
+        struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb);
+        int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
+        int status = 0;
+        mlog_entry_void();
+        /* On RO devices, locking really isn't needed... */
+        if (ocfs2_is_hard_readonly(osb)) {
+                if (ex)
+                        status = -EROFS;
+                goto bail;
+        }
+        if (ocfs2_mount_local(osb))
+                goto bail;
+        status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
+        if (status < 0) {
+                mlog_errno(status);
+                goto bail;
+        }
+        if (!ocfs2_should_refresh_lock_res(lockres))
+                goto bail;
+        /* OK, we have the lock but we need to refresh the quota info */
+        status = ocfs2_refresh_qinfo(oinfo);
+        if (status)
+                ocfs2_qinfo_unlock(oinfo, ex);
+        /* The refresh is completed with its status in either case */
+        ocfs2_complete_lock_res_refresh(lockres, status);
+bail:
+        mlog_exit(status);
+        return status;
+}
 /*
 * This is the filesystem locking protocol.  It provides the lock handling
 * hooks for the underlying DLM.  It has a maximum version number.
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 2bb01f09c1b1..3f8d9986b8e0 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -49,6 +49,19 @@ struct ocfs2_meta_lvb {
        __be32       lvb_reserved2;
 };
+/*
+ * Lock value block layout for quota info locks.  lvb_version is compared
+ * against OCFS2_QINFO_LVB_VERSION before the block is trusted; the 32-bit
+ * fields are big-endian copies of the quota info values.
+ */
+#define OCFS2_QINFO_LVB_VERSION 1
+struct ocfs2_qinfo_lvb {
+        __u8    lvb_version;
+        __u8    lvb_reserved[3];
+        __be32  lvb_bgrace;
+        __be32  lvb_igrace;
+        __be32  lvb_syncms;
+        __be32  lvb_blocks;
+        __be32  lvb_free_blk;
+        __be32  lvb_free_entry;
+};
 /* ocfs2_inode_lock_full() 'arg_flags' flags */
 /* don't wait on recovery. */
 #define OCFS2_META_LOCK_RECOVERY        (0x01)
@@ -69,6 +82,9 @@ void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl,
 struct ocfs2_file_private;
 void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
                              struct ocfs2_file_private *fp);
+struct ocfs2_mem_dqinfo;
+void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres,
+                               struct ocfs2_mem_dqinfo *info);
 void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
 int ocfs2_create_new_inode_locks(struct inode *inode);
 int ocfs2_drop_inode_locks(struct inode *inode);
@@ -103,6 +119,9 @@ int ocfs2_dentry_lock(struct dentry *dentry, int ex);
 void ocfs2_dentry_unlock(struct dentry *dentry, int ex);
 int ocfs2_file_lock(struct file *file, int ex, int trylock);
 void ocfs2_file_unlock(struct file *file);
+int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex);
+void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex);
 void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
 void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 2baedac58234..f2bb1a04d253 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -293,7 +293,7 @@ static int ocfs2_last_eb_is_empty(struct inode *inode,
        struct ocfs2_extent_block *eb;
        struct ocfs2_extent_list *el;
-        ret = ocfs2_read_block(inode, last_eb_blk, &eb_bh);
+        ret = ocfs2_read_extent_block(inode, last_eb_blk, &eb_bh);
        if (ret) {
                mlog_errno(ret);
                goto out;
@@ -302,12 +302,6 @@ static int ocfs2_last_eb_is_empty(struct inode *inode,
        eb = (struct ocfs2_extent_block *) eb_bh->b_data;
        el = &eb->h_list;
-        if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-                ret = -EROFS;
-                OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
-                goto out;
-        }
        if (el->l_tree_depth) {
                ocfs2_error(inode->i_sb,
                            "Inode %lu has non zero tree depth in "
@@ -381,23 +375,16 @@ static int ocfs2_figure_hole_clusters(struct inode *inode,
                if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL)
                        goto no_more_extents;
-                ret = ocfs2_read_block(inode,
+                ret = ocfs2_read_extent_block(inode,
-                                       le64_to_cpu(eb->h_next_leaf_blk),
+                                              le64_to_cpu(eb->h_next_leaf_blk),
-                                       &next_eb_bh);
+                                              &next_eb_bh);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
                }
-                next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data;
-                if (!OCFS2_IS_VALID_EXTENT_BLOCK(next_eb)) {
-                        ret = -EROFS;
-                        OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, next_eb);
-                        goto out;
-                }
+                next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data;
                el = &next_eb->h_list;
                i = ocfs2_search_for_hole_index(el, v_cluster);
        }
@@ -630,7 +617,7 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
        if (ret == 0)
                goto out;
-        ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
+        ret = ocfs2_read_inode_block(inode, &di_bh);
        if (ret) {
                mlog_errno(ret);
                goto out;
@@ -819,3 +806,74 @@ out:
        return ret;
 }
+/*
+ * Read @nr blocks of @inode starting at virtual (file-relative) block
+ * @v_block into @bhs.
+ *
+ * The range is processed in runs: each run is mapped to physical blocks
+ * with ocfs2_extent_map_get_blocks() under ip_alloc_sem (read side) and then
+ * read with ocfs2_read_blocks(), to which @flags and @validate are passed
+ * through unchanged.
+ *
+ * If the last requested block starts at or beyond i_size, nothing is read
+ * and 0 is returned; this is only allowed with OCFS2_BH_READAHEAD set in
+ * @flags (BUG otherwise).
+ *
+ * Returns 0 on success, -EIO if the range contains a hole, or the error
+ * from the extent map lookup or the block read.
+ */
+int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
+                           struct buffer_head *bhs[], int flags,
+                           int (*validate)(struct super_block *sb,
+                                           struct buffer_head *bh))
+{
+        int rc = 0;
+        u64 p_block, p_count;
+        int i, count, done = 0;
+        mlog_entry("(inode = %p, v_block = %llu, nr = %d, bhs = %p, "
+                   "flags = %x, validate = %p)\n",
+                   inode, (unsigned long long)v_block, nr, bhs, flags,
+                   validate);
+        if (((v_block + nr - 1) << inode->i_sb->s_blocksize_bits) >=
+            i_size_read(inode)) {
+                BUG_ON(!(flags & OCFS2_BH_READAHEAD));
+                goto out;
+        }
+        while (done < nr) {
+                down_read(&OCFS2_I(inode)->ip_alloc_sem);
+                rc = ocfs2_extent_map_get_blocks(inode, v_block + done,
+                                                 &p_block, &p_count, NULL);
+                up_read(&OCFS2_I(inode)->ip_alloc_sem);
+                if (rc) {
+                        mlog_errno(rc);
+                        break;
+                }
+                /* A physical block of 0 is treated as a hole */
+                if (!p_block) {
+                        rc = -EIO;
+                        mlog(ML_ERROR,
+                             "Inode #%llu contains a hole at offset %llu\n",
+                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
+                             (unsigned long long)(v_block + done) <<
+                             inode->i_sb->s_blocksize_bits);
+                        break;
+                }
+                /* Read at most the remaining blocks of this mapped run */
+                count = nr - done;
+                if (p_count < count)
+                        count = p_count;
+                /*
+                 * If the caller passed us bhs, they should have come
+                 * from a previous readahead call to this function.  Thus,
+                 * they should have the right b_blocknr.
+                 */
+                for (i = 0; i < count; i++) {
+                        if (!bhs[done + i])
+                                continue;
+                        BUG_ON(bhs[done + i]->b_blocknr != (p_block + i));
+                }
+                rc = ocfs2_read_blocks(inode, p_block, count, bhs + done,
+                                       flags, validate);
+                if (rc) {
+                        mlog_errno(rc);
+                        break;
+                }
+                done += count;
+        }
+out:
+        mlog_exit(rc);
+        return rc;
+}
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index 1c4aa8b06f34..b7dd9731b462 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -57,4 +57,28 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
                             u32 *p_cluster, u32 *num_clusters,
                             struct ocfs2_extent_list *el);
+int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
+                           struct buffer_head *bhs[], int flags,
+                           int (*validate)(struct super_block *sb,
+                                           struct buffer_head *bh));
+/*
+ * Single-block wrapper around ocfs2_read_virt_blocks(): read virtual block
+ * @v_block of @inode into *@bh with flags 0 and the given @validate hook.
+ *
+ * Returns -EINVAL (after logging) if @bh is NULL, otherwise the result of
+ * ocfs2_read_virt_blocks().
+ */
+static inline int ocfs2_read_virt_block(struct inode *inode, u64 v_block,
+                                        struct buffer_head **bh,
+                                        int (*validate)(struct super_block *sb,
+                                                        struct buffer_head *bh))
+{
+        int status = 0;
+        if (bh == NULL) {
+                printk("ocfs2: bh == NULL\n");
+                status = -EINVAL;
+                goto bail;
+        }
+        status = ocfs2_read_virt_blocks(inode, v_block, 1, bh, 0, validate);
+bail:
+        return status;
+}
 #endif  /* _EXTENT_MAP_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index e2570a3bc2b2..a5887df2cd8a 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -35,6 +35,7 @@
 #include <linux/mount.h>
 #include <linux/writeback.h>
 #include <linux/falloc.h>
+#include <linux/quotaops.h>
 #define MLOG_MASK_PREFIX ML_INODE
 #include <cluster/masklog.h>
@@ -56,6 +57,8 @@
 #include "suballoc.h"
 #include "super.h"
 #include "xattr.h"
+#include "acl.h"
+#include "quota.h"
 #include "buffer_head_io.h"
@@ -253,8 +256,8 @@ int ocfs2_update_inode_atime(struct inode *inode,
                goto out;
        }
-        ret = ocfs2_journal_access(handle, inode, bh,
+        ret = ocfs2_journal_access_di(handle, inode, bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                                      OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
                goto out_commit;
@@ -303,9 +306,9 @@ bail:
        return status;
 }
-static int ocfs2_simple_size_update(struct inode *inode,
+int ocfs2_simple_size_update(struct inode *inode,
-                                    struct buffer_head *di_bh,
+                             struct buffer_head *di_bh,
-                                    u64 new_i_size)
+                             u64 new_i_size)
 {
        int ret;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -350,8 +353,8 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
                goto out;
        }
-        status = ocfs2_journal_access(handle, inode, fe_bh,
+        status = ocfs2_journal_access_di(handle, inode, fe_bh,
-                                      OCFS2_JOURNAL_ACCESS_WRITE);
+                                         OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto out_commit;
@@ -401,12 +404,9 @@ static int ocfs2_truncate_file(struct inode *inode,
                   (unsigned long long)OCFS2_I(inode)->ip_blkno,
                   (unsigned long long)new_i_size);
+        /* We trust di_bh because it comes from ocfs2_inode_lock(), which
+         * already validated it */
        fe = (struct ocfs2_dinode *) di_bh->b_data;
-        if (!OCFS2_IS_VALID_DINODE(fe)) {
-                OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
-                status = -EIO;
-                goto bail;
-        }
        mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
                        "Inode %llu, inode i_size = %lld != di "
@@ -536,6 +536,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
        enum ocfs2_alloc_restarted why;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        struct ocfs2_extent_tree et;
+        int did_quota = 0;
        mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
@@ -545,18 +546,12 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
         */
        BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
-        status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
+        status = ocfs2_read_inode_block(inode, &bh);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
        }
        fe = (struct ocfs2_dinode *) bh->b_data;
-        if (!OCFS2_IS_VALID_DINODE(fe)) {
-                OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
-                status = -EIO;
-                goto leave;
-        }
 restart_all:
        BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
@@ -585,11 +580,18 @@ restart_all:
        }
 restarted_transaction:
+        if (vfs_dq_alloc_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb,
+            clusters_to_add))) {
+                status = -EDQUOT;
+                goto leave;
+        }
+        did_quota = 1;
        /* reserve a write to the file entry early on - that we if we
         * run out of credits in the allocation path, we can still
         * update i_size. */
-        status = ocfs2_journal_access(handle, inode, bh,
+        status = ocfs2_journal_access_di(handle, inode, bh,
-                                      OCFS2_JOURNAL_ACCESS_WRITE);
+                                         OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
@@ -622,6 +624,10 @@ restarted_transaction:
        spin_lock(&OCFS2_I(inode)->ip_lock);
        clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
        spin_unlock(&OCFS2_I(inode)->ip_lock);
+        /* Release unused quota reservation */
+        vfs_dq_free_space(inode,
+                        ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
+        did_quota = 0;
        if (why != RESTART_NONE && clusters_to_add) {
                if (why == RESTART_META) {
@@ -654,6 +660,9 @@ restarted_transaction:
             OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode));
 leave:
+        if (status < 0 && did_quota)
+                vfs_dq_free_space(inode,
+                        ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
        if (handle) {
                ocfs2_commit_trans(osb, handle);
                handle = NULL;
@@ -885,6 +894,9 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
        struct ocfs2_super *osb = OCFS2_SB(sb);
        struct buffer_head *bh = NULL;
        handle_t *handle = NULL;
+        int locked[MAXQUOTAS] = {0, 0};
+        int credits, qtype;
+        struct ocfs2_mem_dqinfo *oinfo;
        mlog_entry("(0x%p, '%.*s')\n", dentry,
                   dentry->d_name.len, dentry->d_name.name);
@@ -955,11 +967,47 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
                }
        }
-        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+        if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
-        if (IS_ERR(handle)) {
+            (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
-                status = PTR_ERR(handle);
+                credits = OCFS2_INODE_UPDATE_CREDITS;
-                mlog_errno(status);
+                if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid
-                goto bail_unlock;
+                    && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+                    OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
+                        oinfo = sb_dqinfo(sb, USRQUOTA)->dqi_priv;
+                        status = ocfs2_lock_global_qf(oinfo, 1);
+                        if (status < 0)
+                                goto bail_unlock;
+                        credits += ocfs2_calc_qinit_credits(sb, USRQUOTA) +
+                                ocfs2_calc_qdel_credits(sb, USRQUOTA);
+                        locked[USRQUOTA] = 1;
+                }
+                if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid
+                    && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+                    OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
+                        oinfo = sb_dqinfo(sb, GRPQUOTA)->dqi_priv;
+                        status = ocfs2_lock_global_qf(oinfo, 1);
+                        if (status < 0)
+                                goto bail_unlock;
+                        credits += ocfs2_calc_qinit_credits(sb, GRPQUOTA) +
+                                   ocfs2_calc_qdel_credits(sb, GRPQUOTA);
+                        locked[GRPQUOTA] = 1;
+                }
+                handle = ocfs2_start_trans(osb, credits);
+                if (IS_ERR(handle)) {
+                        status = PTR_ERR(handle);
+                        mlog_errno(status);
+                        goto bail_unlock;
+                }
+                status = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
+                if (status < 0)
+                        goto bail_commit;
+        } else {
+                handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+                if (IS_ERR(handle)) {
+                        status = PTR_ERR(handle);
+                        mlog_errno(status);
+                        goto bail_unlock;
+                }
        }
        /*
@@ -982,6 +1030,12 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 bail_commit:
        ocfs2_commit_trans(osb, handle);
 bail_unlock:
+        for (qtype = 0; qtype < MAXQUOTAS; qtype++) {
+                if (!locked[qtype])
+                        continue;
+                oinfo = sb_dqinfo(sb, qtype)->dqi_priv;
+                ocfs2_unlock_global_qf(oinfo, 1);
+        }
        ocfs2_inode_unlock(inode, 1);
 bail_unlock_rw:
        if (size_change)
@@ -989,6 +1043,12 @@ bail_unlock_rw:
 bail:
        brelse(bh);
+        if (!status && attr->ia_valid & ATTR_MODE) {
+                status = ocfs2_acl_chmod(inode);
+                if (status < 0)
+                        mlog_errno(status);
+        }
        mlog_exit(status);
        return status;
 }
@@ -1035,7 +1095,7 @@ int ocfs2_permission(struct inode *inode, int mask)
                goto out;
        }
-        ret = generic_permission(inode, mask, NULL);
+        ret = generic_permission(inode, mask, ocfs2_check_acl);
        ocfs2_inode_unlock(inode, 0);
 out:
@@ -1061,8 +1121,8 @@ static int __ocfs2_write_remove_suid(struct inode *inode,
                goto out;
        }
-        ret = ocfs2_journal_access(handle, inode, bh,
+        ret = ocfs2_journal_access_di(handle, inode, bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                                      OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret < 0) {
                mlog_errno(ret);
                goto out_trans;
@@ -1128,9 +1188,8 @@ static int ocfs2_write_remove_suid(struct inode *inode)
 {
        int ret;
        struct buffer_head *bh = NULL;
-        struct ocfs2_inode_info *oi = OCFS2_I(inode);
-        ret = ocfs2_read_block(inode, oi->ip_blkno, &bh);
+        ret = ocfs2_read_inode_block(inode, &bh);
        if (ret < 0) {
                mlog_errno(ret);
                goto out;
@@ -1156,8 +1215,7 @@ static int ocfs2_allocate_unwritten_extents(struct inode *inode,
        struct buffer_head *di_bh = NULL;
        if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
-                ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno,
+                ret = ocfs2_read_inode_block(inode, &di_bh);
-                                       &di_bh);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
@@ -1226,83 +1284,6 @@ out:
        return ret;
 }
-static int __ocfs2_remove_inode_range(struct inode *inode,
-                                      struct buffer_head *di_bh,
-                                      u32 cpos, u32 phys_cpos, u32 len,
-                                      struct ocfs2_cached_dealloc_ctxt *dealloc)
-{
-        int ret;
-        u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
-        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-        struct inode *tl_inode = osb->osb_tl_inode;
-        handle_t *handle;
-        struct ocfs2_alloc_context *meta_ac = NULL;
-        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
-        struct ocfs2_extent_tree et;
-        ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
-        ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
-        if (ret) {
-                mlog_errno(ret);
-                return ret;
-        }
-        mutex_lock(&tl_inode->i_mutex);
-        if (ocfs2_truncate_log_needs_flush(osb)) {
-                ret = __ocfs2_flush_truncate_log(osb);
-                if (ret < 0) {
-                        mlog_errno(ret);
-                        goto out;
-                }
-        }
-        handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
-        if (IS_ERR(handle)) {
-                ret = PTR_ERR(handle);
-                mlog_errno(ret);
-                goto out;
-        }
-        ret = ocfs2_journal_access(handle, inode, di_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
-        if (ret) {
-                mlog_errno(ret);
-                goto out;
-        }
-        ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac,
-                                  dealloc);
-        if (ret) {
-                mlog_errno(ret);
-                goto out_commit;
-        }
-        OCFS2_I(inode)->ip_clusters -= len;
-        di->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters);
-        ret = ocfs2_journal_dirty(handle, di_bh);
-        if (ret) {
-                mlog_errno(ret);
-                goto out_commit;
-        }
-        ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
-        if (ret)
-                mlog_errno(ret);
-out_commit:
-        ocfs2_commit_trans(osb, handle);
-out:
-        mutex_unlock(&tl_inode->i_mutex);
-        if (meta_ac)
-                ocfs2_free_alloc_context(meta_ac);
-        return ret;
-}
 /*
 * Truncate a byte range, avoiding pages within partial clusters. This
 * preserves those pages for the zeroing code to write to.
@@ -1402,7 +1383,9 @@ static int ocfs2_remove_inode_range(struct inode *inode,
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        struct ocfs2_cached_dealloc_ctxt dealloc;
        struct address_space *mapping = inode->i_mapping;
+        struct ocfs2_extent_tree et;
+        ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
        ocfs2_init_dealloc_ctxt(&dealloc);
        if (byte_len == 0)
@@ -1458,9 +1441,9 @@ static int ocfs2_remove_inode_range(struct inode *inode,
                /* Only do work for non-holes */
                if (phys_cpos != 0) {
-                        ret = __ocfs2_remove_inode_range(inode, di_bh, cpos,
+                        ret = ocfs2_remove_btree_range(inode, &et, cpos,
-                                                         phys_cpos, alloc_size,
+                                                       phys_cpos, alloc_size,
-                                                         &dealloc);
+                                                       &dealloc);
                        if (ret) {
                                mlog_errno(ret);
                                goto out;
@@ -1622,7 +1605,7 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd,
                            struct ocfs2_space_resv *sr)
 {
        struct inode *inode = file->f_path.dentry->d_inode;
-        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);;
+        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) &&
            !ocfs2_writes_unwritten_extents(osb))
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index e92382cbca5f..172f9fbc9fc7 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -51,6 +51,9 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb,
                         struct ocfs2_alloc_context *data_ac,
                         struct ocfs2_alloc_context *meta_ac,
                         enum ocfs2_alloc_restarted *reason_ret);
+int ocfs2_simple_size_update(struct inode *inode,
+                             struct buffer_head *di_bh,
+                             u64 new_i_size);
 int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size,
                          u64 zero_to);
 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 7aa00d511874..229e707bc050 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -28,6 +28,7 @@
 #include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
+#include <linux/quotaops.h>
 #include <asm/byteorder.h>
@@ -37,6 +38,7 @@
 #include "ocfs2.h"
 #include "alloc.h"
+#include "blockcheck.h"
 #include "dlmglue.h"
 #include "extent_map.h"
 #include "file.h"
@@ -214,12 +216,11 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
        return 0;
 }
-int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
+void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
-                         int create_ino)
+                          int create_ino)
 {
        struct super_block *sb;
        struct ocfs2_super *osb;
-        int status = -EINVAL;
        int use_plocks = 1;
        mlog_entry("(0x%p, size:%llu)\n", inode,
@@ -232,25 +233,17 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
            ocfs2_mount_local(osb) || !ocfs2_stack_supports_plocks())
                use_plocks = 0;
-        /* this means that read_inode cannot create a superblock inode
+        /*
-         * today.  change if needed. */
+         * These have all been checked by ocfs2_read_inode_block() or set
-        if (!OCFS2_IS_VALID_DINODE(fe) ||
+         * by ocfs2_mknod_locked(), so a failure is a code bug.
-            !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
+         */
-                mlog(0, "Invalid dinode: i_ino=%lu, i_blkno=%llu, "
+        BUG_ON(!OCFS2_IS_VALID_DINODE(fe));  /* This means that read_inode
-                     "signature = %.*s, flags = 0x%x\n",
+                                                cannot create a superblock
-                     inode->i_ino,
+                                                inode today.  change if
-                     (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
+                                                that is needed. */
-                     fe->i_signature, le32_to_cpu(fe->i_flags));
+        BUG_ON(!(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL)));
-                goto bail;
+        BUG_ON(le32_to_cpu(fe->i_fs_generation) != osb->fs_generation);
-        }
-        if (le32_to_cpu(fe->i_fs_generation) != osb->fs_generation) {
-                mlog(ML_ERROR, "file entry generation does not match "
-                     "superblock! osb->fs_generation=%x, "
-                     "fe->i_fs_generation=%x\n",
-                     osb->fs_generation, le32_to_cpu(fe->i_fs_generation));
-                goto bail;
-        }
        OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
        OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
@@ -284,14 +277,18 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
        inode->i_nlink = le16_to_cpu(fe->i_links_count);
-        if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL))
+        if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) {
                OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE;
+                inode->i_flags |= S_NOQUOTA;
+        }
        if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) {
                OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
                mlog(0, "local alloc inode: i_ino=%lu\n", inode->i_ino);
        } else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) {
                OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
+        } else if (fe->i_flags & cpu_to_le32(OCFS2_QUOTA_FL)) {
+                inode->i_flags |= S_NOQUOTA;
        } else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) {
                mlog(0, "superblock inode: i_ino=%lu\n", inode->i_ino);
                /* we can't actually hit this as read_inode can't
@@ -354,10 +351,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
        ocfs2_set_inode_flags(inode);
-        status = 0;
+        mlog_exit_void();
-bail:
-        mlog_exit(status);
-        return status;
 }
 static int ocfs2_read_locked_inode(struct inode *inode,
@@ -460,11 +454,14 @@ static int ocfs2_read_locked_inode(struct inode *inode,
                }
        }
-        if (can_lock)
+        if (can_lock) {
-                status = ocfs2_read_blocks(inode, args->fi_blkno, 1, &bh,
+                status = ocfs2_read_inode_block_full(inode, &bh,
-                                           OCFS2_BH_IGNORE_CACHE);
+                                                     OCFS2_BH_IGNORE_CACHE);
-        else
+        } else {
                status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh);
+                if (!status)
+                        status = ocfs2_validate_inode_block(osb->sb, bh);
+        }
        if (status < 0) {
                mlog_errno(status);
                goto bail;
@@ -472,12 +469,6 @@ static int ocfs2_read_locked_inode(struct inode *inode,
        status = -EINVAL;
        fe = (struct ocfs2_dinode *) bh->b_data;
-        if (!OCFS2_IS_VALID_DINODE(fe)) {
-                mlog(0, "Invalid dinode #%llu: signature = %.*s\n",
-                     (unsigned long long)args->fi_blkno, 7,
-                     fe->i_signature);
-                goto bail;
-        }
        /*
         * This is a code bug. Right now the caller needs to
@@ -491,10 +482,9 @@ static int ocfs2_read_locked_inode(struct inode *inode,
        if (S_ISCHR(le16_to_cpu(fe->i_mode)) ||
            S_ISBLK(le16_to_cpu(fe->i_mode)))
-                inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
+                inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
-        if (ocfs2_populate_inode(inode, fe, 0) < 0)
+        ocfs2_populate_inode(inode, fe, 0);
-                goto bail;
        BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno));
@@ -547,8 +537,8 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
                        goto out;
                }
-                status = ocfs2_journal_access(handle, inode, fe_bh,
+                status = ocfs2_journal_access_di(handle, inode, fe_bh,
-                                              OCFS2_JOURNAL_ACCESS_WRITE);
+                                                 OCFS2_JOURNAL_ACCESS_WRITE);
                if (status < 0) {
                        mlog_errno(status);
                        goto out;
@@ -615,7 +605,8 @@ static int ocfs2_remove_inode(struct inode *inode,
                goto bail;
        }
-        handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS);
+        handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS +
+                                        ocfs2_quota_trans_credits(inode->i_sb));
        if (IS_ERR(handle)) {
                status = PTR_ERR(handle);
                mlog_errno(status);
@@ -630,8 +621,8 @@ static int ocfs2_remove_inode(struct inode *inode,
        }
        /* set the inodes dtime */
-        status = ocfs2_journal_access(handle, inode, di_bh,
+        status = ocfs2_journal_access_di(handle, inode, di_bh,
-                                      OCFS2_JOURNAL_ACCESS_WRITE);
+                                         OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto bail_commit;
@@ -647,6 +638,7 @@ static int ocfs2_remove_inode(struct inode *inode,
        }
        ocfs2_remove_from_cache(inode, di_bh);
+        vfs_dq_free_inode(inode);
        status = ocfs2_free_dinode(handle, inode_alloc_inode,
                                   inode_alloc_bh, di);
@@ -929,7 +921,10 @@ void ocfs2_delete_inode(struct inode *inode)
        mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
-        if (is_bad_inode(inode)) {
+        /* When we fail in read_inode() we mark inode as bad. The second test
+         * catches the case when inode allocation fails before allocating
+         * a block for inode. */
+        if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno) {
                mlog(0, "Skipping delete of bad inode\n");
                goto bail;
        }
@@ -1195,8 +1190,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
        mlog_entry("(inode %llu)\n",
                   (unsigned long long)OCFS2_I(inode)->ip_blkno);
-        status = ocfs2_journal_access(handle, inode, bh,
+        status = ocfs2_journal_access_di(handle, inode, bh,
-                                      OCFS2_JOURNAL_ACCESS_WRITE);
+                                         OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
@@ -1264,3 +1259,89 @@ void ocfs2_refresh_inode(struct inode *inode,
        spin_unlock(&OCFS2_I(inode)->ip_lock);
 }
+int ocfs2_validate_inode_block(struct super_block *sb,
+                               struct buffer_head *bh)
+{       /*
+         * Sanity-check the dinode held in @bh.  Returns 0 when every check
+         * passes, the ocfs2_validate_meta_ecc() result when the checksum
+         * check fails, and -EINVAL when any of the later checks fails.
+         */
+        int rc;
+        struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
+        mlog(0, "Validating dinode %llu\n",
+             (unsigned long long)bh->b_blocknr);
+        BUG_ON(!buffer_uptodate(bh));   /* callers must pass an up-to-date bh */
+        /*
+         * If the ecc fails, we return the error but otherwise
+         * leave the filesystem running.  We know any error is
+         * local to this block.
+         */
+        rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check);
+        if (rc) {
+                mlog(ML_ERROR, "Checksum failed for dinode %llu\n",
+                     (unsigned long long)bh->b_blocknr);
+                goto bail;
+        }
+        /*
+         * Errors after here are fatal.  Each one is reported through
+         * ocfs2_error() rather than mlog(), and all of them return -EINVAL.
+         */
+        rc = -EINVAL;
+        if (!OCFS2_IS_VALID_DINODE(di)) {
+                ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n",
+                            (unsigned long long)bh->b_blocknr, 7,
+                            di->i_signature);
+                goto bail;
+        }
+        if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) {        /* on-disk i_blkno must equal the buffer's block number */
+                ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n",
+                            (unsigned long long)bh->b_blocknr,
+                            (unsigned long long)le64_to_cpu(di->i_blkno));
+                goto bail;
+        }
+        if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
+                ocfs2_error(sb,
+                            "Invalid dinode #%llu: OCFS2_VALID_FL not set\n",
+                            (unsigned long long)bh->b_blocknr);
+                goto bail;
+        }
+        if (le32_to_cpu(di->i_fs_generation) !=
+            OCFS2_SB(sb)->fs_generation) {      /* must match the generation stored in the ocfs2_super */
+                ocfs2_error(sb,
+                            "Invalid dinode #%llu: fs_generation is %u\n",
+                            (unsigned long long)bh->b_blocknr,
+                            le32_to_cpu(di->i_fs_generation));
+                goto bail;
+        }
+        rc = 0;         /* every check passed */
+bail:
+        return rc;
+}
+int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh,
+                                int flags)
+{       /*
+         * Read the single block at OCFS2_I(inode)->ip_blkno through
+         * ocfs2_read_blocks(), passing @flags along and naming
+         * ocfs2_validate_inode_block() as the validation callback.
+         * Returns the ocfs2_read_blocks() result unchanged.
+         */
+        int rc;
+        struct buffer_head *tmp = *bh;  /* work on a copy so *bh is only written on success */
+        rc = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, &tmp,
+                               flags, ocfs2_validate_inode_block);
+        /* If ocfs2_read_blocks() got us a new bh, pass it up. */
+        if (!rc && !*bh)        /* only when the caller passed in *bh == NULL */
+                *bh = tmp;
+        return rc;
+}
+int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh)
+{       /* Convenience form of ocfs2_read_inode_block_full() with flags == 0. */
+        return ocfs2_read_inode_block_full(inode, bh, 0);
+}
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 2f37af9bcc4a..eb3c302b38d3 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -128,8 +128,8 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags,
                         int sysfile_type);
 int ocfs2_inode_init_private(struct inode *inode);
 int ocfs2_inode_revalidate(struct dentry *dentry);
-int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
+void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
-                         int create_ino);
+                          int create_ino);
 void ocfs2_read_inode(struct inode *inode);
 void ocfs2_read_inode2(struct inode *inode, void *opaque);
 ssize_t ocfs2_rw_direct(int rw, struct file *filp, char *buf,
@@ -142,6 +142,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
                           struct buffer_head *bh);
 int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb);
 int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb);
+struct buffer_head *ocfs2_bread(struct inode *inode,
+                                int block, int *err, int reada);
 void ocfs2_set_inode_flags(struct inode *inode);
 void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi);
@@ -153,4 +155,16 @@ static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode)
        return (blkcnt_t)(OCFS2_I(inode)->ip_clusters << c_to_s_bits);
 }
+/* Validate that a bh contains a valid inode */
+int ocfs2_validate_inode_block(struct super_block *sb,
+                               struct buffer_head *bh);
+/*
+ * Read an inode block into *bh.  If *bh is NULL, a bh will be allocated.
+ * This is a cached read.  The inode will be validated with
+ * ocfs2_validate_inode_block().
+ */
+int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh);
+/* The same, but can be passed OCFS2_BH_* flags */
+int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh,
+                                int flags);
 #endif /* OCFS2_INODE_H */
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 99fe9d584f3c..57d7d25a2b9a 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -35,6 +35,7 @@
 #include "ocfs2.h"
 #include "alloc.h"
+#include "blockcheck.h"
 #include "dir.h"
 #include "dlmglue.h"
 #include "extent_map.h"
@@ -45,6 +46,7 @@
 #include "slot_map.h"
 #include "super.h"
 #include "sysfile.h"
+#include "quota.h"
 #include "buffer_head_io.h"
@@ -52,10 +54,10 @@ DEFINE_SPINLOCK(trans_inc_lock);
 static int ocfs2_force_read_journal(struct inode *inode);
 static int ocfs2_recover_node(struct ocfs2_super *osb,
-                              int node_num);
+                              int node_num, int slot_num);
 static int __ocfs2_recovery_thread(void *arg);
 static int ocfs2_commit_cache(struct ocfs2_super *osb);
-static int ocfs2_wait_on_mount(struct ocfs2_super *osb);
+static int __ocfs2_wait_on_mount(struct ocfs2_super *osb, int quota);
 static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
                                      int dirty, int replayed);
 static int ocfs2_trylock_journal(struct ocfs2_super *osb,
@@ -64,6 +66,17 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
                                 int slot);
 static int ocfs2_commit_thread(void *arg);
+static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb)
+{       /* Wrapper: calls __ocfs2_wait_on_mount() with its quota argument set to 0. */
+        return __ocfs2_wait_on_mount(osb, 0);
+}
+static inline int ocfs2_wait_on_quotas(struct ocfs2_super *osb)
+{       /* Wrapper: calls __ocfs2_wait_on_mount() with its quota argument set to 1. */
+        return __ocfs2_wait_on_mount(osb, 1);
+}
 /*
 * The recovery_list is a simple linked list of node numbers to recover.
@@ -256,11 +269,9 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
        BUG_ON(osb->journal->j_state == OCFS2_JOURNAL_FREE);
        BUG_ON(max_buffs <= 0);
-        /* JBD might support this, but our journalling code doesn't yet. */
+        /* Nested transaction? Just return the handle... */
-        if (journal_current_handle()) {
+        if (journal_current_handle())
-                mlog(ML_ERROR, "Recursive transaction attempted!\n");
+                return jbd2_journal_start(journal, max_buffs);
-                BUG();
-        }
        down_read(&osb->journal->j_trans_barrier);
@@ -285,16 +296,18 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
 int ocfs2_commit_trans(struct ocfs2_super *osb,
                       handle_t *handle)
 {      /*
         * Stop @handle with jbd2_journal_stop() and return its result.
         * After this change j_trans_barrier is released only when the
         * handle was not nested (h_ref <= 1 before the stop).
         */
-        int ret;
+        int ret, nested;
         struct ocfs2_journal *journal = osb->journal;
         BUG_ON(!handle);
+        nested = handle->h_ref > 1;     /* sampled before the stop; NOTE(review): presumably the handle must not be read afterwards - confirm */
         ret = jbd2_journal_stop(handle);
         if (ret < 0)
                 mlog_errno(ret);        /* logged, but still returned to the caller */
-        up_read(&journal->j_trans_barrier);
+        if (!nested)
+                up_read(&journal->j_trans_barrier);
         return ret;
 }
@@ -357,10 +370,137 @@ bail:
        return status;
 }
-int ocfs2_journal_access(handle_t *handle,
+struct ocfs2_triggers {        /* a jbd2 trigger set plus the offset of the block's check field */
-                         struct inode *inode,
+        struct jbd2_buffer_trigger_type ot_triggers;   /* to_ocfs2_trigger() applies container_of() to this member */
-                         struct buffer_head *bh,
+        int                             ot_offset;     /* added to the data pointer in ocfs2_commit_trigger() */
-                         int type)
+};
+static inline struct ocfs2_triggers *to_ocfs2_trigger(struct jbd2_buffer_trigger_type *triggers)
+{       /* Map an embedded ot_triggers pointer back to its enclosing struct ocfs2_triggers. */
+        return container_of(triggers, struct ocfs2_triggers, ot_triggers);
+}
+static void ocfs2_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
+                                 struct buffer_head *bh,
+                                 void *data, size_t size)
+{       /*
+         * t_commit callback shared by the di/eb/gd/xb trigger tables below,
+         * i.e. the ones that set ot_offset.  @bh is not used here.
+         */
+        struct ocfs2_triggers *ot = to_ocfs2_trigger(triggers);
+        /*
+         * We aren't guaranteed to have the superblock here, so we
+         * must unconditionally compute the ecc data.
+         * __ocfs2_journal_access() will only set the triggers if
+         * metaecc is enabled.
+         */
+        ocfs2_block_check_compute(data, size, data + ot->ot_offset);   /* third argument points ot_offset bytes into @data */
+}
+/*
+ * Quota blocks have their own trigger because the struct ocfs2_block_check
+ * offset depends on the blocksize.
+ *
+ * The trailer is located with ocfs2_block_dqtrailer(size, data) and its
+ * dq_check member is handed to ocfs2_block_check_compute().  Neither
+ * @triggers nor @bh is used.
+ */
+static void ocfs2_dq_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
+                                 struct buffer_head *bh,
+                                 void *data, size_t size)
+{
+        struct ocfs2_disk_dqtrailer *dqt =
+                ocfs2_block_dqtrailer(size, data);
+        /*
+         * We aren't guaranteed to have the superblock here, so we
+         * must unconditionally compute the ecc data.
+         * __ocfs2_journal_access() will only set the triggers if
+         * metaecc is enabled.
+         */
+        ocfs2_block_check_compute(data, size, &dqt->dq_check);
+}
+/*
+ * Directory blocks also have their own trigger because the
+ * struct ocfs2_block_check offset depends on the blocksize.
+ *
+ * The trailer is located with ocfs2_dir_trailer_from_size(size, data) and
+ * its db_check member is handed to ocfs2_block_check_compute().  Neither
+ * @triggers nor @bh is used.
+ */
+static void ocfs2_db_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
+                                 struct buffer_head *bh,
+                                 void *data, size_t size)
+{
+        struct ocfs2_dir_block_trailer *trailer =
+                ocfs2_dir_trailer_from_size(size, data);
+        /*
+         * We aren't guaranteed to have the superblock here, so we
+         * must unconditionally compute the ecc data.
+         * __ocfs2_journal_access() will only set the triggers if
+         * metaecc is enabled.
+         */
+        ocfs2_block_check_compute(data, size, &trailer->db_check);
+}
+static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers,
+                                struct buffer_head *bh)
+{       /*
+         * t_abort callback used by every ocfs2_triggers table below: logs
+         * the buffer at ML_ERROR, then calls ocfs2_error().  @triggers is
+         * not used.
+         */
+        mlog(ML_ERROR,
+             "ocfs2_abort_trigger called by JBD2.  bh = 0x%lx, "
+             "bh->b_blocknr = %llu\n",
+             (unsigned long)bh,
+             (unsigned long long)bh->b_blocknr);
+        /* We aren't guaranteed to have the superblock here - but if we
+         * don't, it'll just crash. */
+        ocfs2_error(bh->b_assoc_map->host->i_sb,        /* superblock reached through the bh's associated mapping */
+                    "JBD2 has aborted our journal, ocfs2 cannot continue\n");
+}
+static struct ocfs2_triggers di_triggers = {   /* passed by ocfs2_journal_access_di() */
+        .ot_triggers = {
+                .t_commit = ocfs2_commit_trigger,
+                .t_abort = ocfs2_abort_trigger,
+        },
+        .ot_offset      = offsetof(struct ocfs2_dinode, i_check),      /* check field of a dinode block */
+};
+static struct ocfs2_triggers eb_triggers = {   /* passed by ocfs2_journal_access_eb() */
+        .ot_triggers = {
+                .t_commit = ocfs2_commit_trigger,
+                .t_abort = ocfs2_abort_trigger,
+        },
+        .ot_offset      = offsetof(struct ocfs2_extent_block, h_check),        /* check field of an extent block */
+};
+static struct ocfs2_triggers gd_triggers = {   /* passed by ocfs2_journal_access_gd() */
+        .ot_triggers = {
+                .t_commit = ocfs2_commit_trigger,
+                .t_abort = ocfs2_abort_trigger,
+        },
+        .ot_offset      = offsetof(struct ocfs2_group_desc, bg_check), /* check field of a group descriptor */
+};
+static struct ocfs2_triggers db_triggers = {   /* passed by ocfs2_journal_access_db() */
+        .ot_triggers = {
+                .t_commit = ocfs2_db_commit_trigger,   /* finds the check field from the block size, so no ot_offset is set */
+                .t_abort = ocfs2_abort_trigger,
+        },
+};
+static struct ocfs2_triggers xb_triggers = {   /* passed by ocfs2_journal_access_xb() */
+        .ot_triggers = {
+                .t_commit = ocfs2_commit_trigger,
+                .t_abort = ocfs2_abort_trigger,
+        },
+        .ot_offset      = offsetof(struct ocfs2_xattr_block, xb_check),        /* check field of an xattr block */
+};
+static struct ocfs2_triggers dq_triggers = {   /* passed by ocfs2_journal_access_dq() */
+        .ot_triggers = {
+                .t_commit = ocfs2_dq_commit_trigger,   /* finds the check field from the block size, so no ot_offset is set */
+                .t_abort = ocfs2_abort_trigger,
+        },
+};
+static int __ocfs2_journal_access(handle_t *handle,
+                                  struct inode *inode,
+                                  struct buffer_head *bh,
+                                  struct ocfs2_triggers *triggers,
+                                  int type)
 {
        int status;
@@ -406,6 +546,8 @@ int ocfs2_journal_access(handle_t *handle,
                status = -EINVAL;
                mlog(ML_ERROR, "Uknown access type!\n");
        }
+        if (!status && ocfs2_meta_ecc(OCFS2_SB(inode->i_sb)) && triggers)
+                jbd2_journal_set_triggers(bh, &triggers->ot_triggers);
        mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
        if (status < 0)
@@ -416,6 +558,54 @@ int ocfs2_journal_access(handle_t *handle,
        return status;
 }
+/*
+ * Journal access for inode blocks (struct ocfs2_dinode).  Forwards to
+ * __ocfs2_journal_access() offering di_triggers, which that helper installs
+ * on the buffer only if access succeeded and ocfs2_meta_ecc() is true for
+ * the inode's superblock.  Returns the helper's status unchanged.
+ */
+int ocfs2_journal_access_di(handle_t *handle,
+                            struct inode *inode,
+                            struct buffer_head *bh,
+                            int type)
+{
+        return __ocfs2_journal_access(handle, inode, bh,
+                                      &di_triggers, type);
+}
+/*
+ * Journal access for extent blocks (struct ocfs2_extent_block).  Forwards to
+ * __ocfs2_journal_access() offering eb_triggers, which that helper installs
+ * on the buffer only if access succeeded and ocfs2_meta_ecc() is true for
+ * the inode's superblock.  Returns the helper's status unchanged.
+ */
+int ocfs2_journal_access_eb(handle_t *handle,
+                            struct inode *inode,
+                            struct buffer_head *bh,
+                            int type)
+{
+        return __ocfs2_journal_access(handle, inode, bh,
+                                      &eb_triggers, type);
+}
+/*
+ * Journal access for group descriptors (struct ocfs2_group_desc).  Forwards
+ * to __ocfs2_journal_access() offering gd_triggers, which that helper
+ * installs on the buffer only if access succeeded and ocfs2_meta_ecc() is
+ * true for the inode's superblock.  Returns the helper's status unchanged.
+ */
+int ocfs2_journal_access_gd(handle_t *handle,
+                            struct inode *inode,
+                            struct buffer_head *bh,
+                            int type)
+{
+        return __ocfs2_journal_access(handle, inode, bh,
+                                      &gd_triggers, type);
+}
+/*
+ * Journal access for directory blocks.  Forwards to
+ * __ocfs2_journal_access() offering db_triggers, which that helper installs
+ * on the buffer only if access succeeded and ocfs2_meta_ecc() is true for
+ * the inode's superblock.  Returns the helper's status unchanged.
+ */
+int ocfs2_journal_access_db(handle_t *handle,
+                            struct inode *inode,
+                            struct buffer_head *bh,
+                            int type)
+{
+        return __ocfs2_journal_access(handle, inode, bh,
+                                      &db_triggers, type);
+}
+/*
+ * Journal access for xattr blocks (struct ocfs2_xattr_block).  Forwards to
+ * __ocfs2_journal_access() offering xb_triggers, which that helper installs
+ * on the buffer only if access succeeded and ocfs2_meta_ecc() is true for
+ * the inode's superblock.  Returns the helper's status unchanged.
+ */
+int ocfs2_journal_access_xb(handle_t *handle,
+                            struct inode *inode,
+                            struct buffer_head *bh,
+                            int type)
+{
+        return __ocfs2_journal_access(handle, inode, bh,
+                                      &xb_triggers, type);
+}
+/*
+ * Journal access for quota blocks.  Forwards to __ocfs2_journal_access()
+ * offering dq_triggers, which that helper installs on the buffer only if
+ * access succeeded and ocfs2_meta_ecc() is true for the inode's superblock.
+ * Returns the helper's status unchanged.
+ */
+int ocfs2_journal_access_dq(handle_t *handle,
+                            struct inode *inode,
+                            struct buffer_head *bh,
+                            int type)
+{
+        return __ocfs2_journal_access(handle, inode, bh,
+                                      &dq_triggers, type);
+}
+/*
+ * Journal access for buffers that carry no ecc.  Forwards to
+ * __ocfs2_journal_access() with a NULL trigger set, so that helper never
+ * calls jbd2_journal_set_triggers() for the buffer.  Returns the helper's
+ * status unchanged.
+ */
+int ocfs2_journal_access(handle_t *handle,
+                         struct inode *inode,
+                         struct buffer_head *bh,
+                         int type)
+{
+        struct ocfs2_triggers *no_triggers = NULL;
+        return __ocfs2_journal_access(handle, inode, bh, no_triggers, type);
+}
 int ocfs2_journal_dirty(handle_t *handle,
                        struct buffer_head *bh)
 {
@@ -434,20 +624,6 @@ int ocfs2_journal_dirty(handle_t *handle,
        return status;
 }
-#ifdef CONFIG_OCFS2_COMPAT_JBD
-int ocfs2_journal_dirty_data(handle_t *handle,
-                             struct buffer_head *bh)
-{
-        int err = journal_dirty_data(handle, bh);
-        if (err)
-                mlog_errno(err);
-        /* TODO: When we can handle it, abort the handle and go RO on
-         * error here. */
-        return err;
-}
-#endif
 #define OCFS2_DEFAULT_COMMIT_INTERVAL   (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE)
 void ocfs2_set_journal_params(struct ocfs2_super *osb)
@@ -587,17 +763,11 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
        mlog_entry_void();
        fe = (struct ocfs2_dinode *)bh->b_data;
-        if (!OCFS2_IS_VALID_DINODE(fe)) {
-                /* This is called from startup/shutdown which will
+        /* The journal bh on the osb always comes from ocfs2_journal_init()
-                 * handle the errors in a specific manner, so no need
+         * and was validated there inside ocfs2_inode_lock_full().  It's a
-                 * to call ocfs2_error() here. */
+         * code bug if we mess it up. */
-                mlog(ML_ERROR, "Journal dinode %llu  has invalid "
+        BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
-                     "signature: %.*s",
-                     (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
-                     fe->i_signature);
-                status = -EIO;
-                goto out;
-        }
        flags = le32_to_cpu(fe->id1.journal1.ij_flags);
        if (dirty)
@@ -609,11 +779,11 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
        if (replayed)
                ocfs2_bump_recovery_generation(fe);
+        ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check);
        status = ocfs2_write_block(osb, bh, journal->j_inode);
        if (status < 0)
                mlog_errno(status);
-out:
        mlog_exit(status);
        return status;
 }
@@ -878,6 +1048,7 @@ struct ocfs2_la_recovery_item {
        int                     lri_slot;
        struct ocfs2_dinode     *lri_la_dinode;
        struct ocfs2_dinode     *lri_tl_dinode;
+        struct ocfs2_quota_recovery *lri_qrec;
 };
 /* Does the second half of the recovery process. By this point, the
@@ -898,6 +1069,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
        struct ocfs2_super *osb = journal->j_osb;
        struct ocfs2_dinode *la_dinode, *tl_dinode;
        struct ocfs2_la_recovery_item *item, *n;
+        struct ocfs2_quota_recovery *qrec;
        LIST_HEAD(tmp_la_list);
        mlog_entry_void();
@@ -913,6 +1085,8 @@ void ocfs2_complete_recovery(struct work_struct *work)
                mlog(0, "Complete recovery for slot %d\n", item->lri_slot);
+                ocfs2_wait_on_quotas(osb);
                la_dinode = item->lri_la_dinode;
                if (la_dinode) {
                        mlog(0, "Clean up local alloc %llu\n",
@@ -943,6 +1117,16 @@ void ocfs2_complete_recovery(struct work_struct *work)
                if (ret < 0)
                        mlog_errno(ret);
+                /* Only items queued with a quota recovery record carry
+                 * lri_qrec; all others leave it NULL. */
+                qrec = item->lri_qrec;
+                if (qrec) {
+                        mlog(0, "Recovering quota files\n");
+                        ret = ocfs2_finish_quota_recovery(osb, qrec,
+                                                          item->lri_slot);
+                        if (ret < 0)
+                                mlog_errno(ret);
+                        /* Recovery info is already freed now */
+                }
                kfree(item);
        }
@@ -956,7 +1140,8 @@ void ocfs2_complete_recovery(struct work_struct *work)
 static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
                                            int slot_num,
                                            struct ocfs2_dinode *la_dinode,
-                                            struct ocfs2_dinode *tl_dinode)
+                                            struct ocfs2_dinode *tl_dinode,
+                                            struct ocfs2_quota_recovery *qrec)
 {
        struct ocfs2_la_recovery_item *item;
@@ -971,6 +1156,9 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
                if (tl_dinode)
                        kfree(tl_dinode);
+                if (qrec)
+                        ocfs2_free_quota_recovery(qrec);
                mlog_errno(-ENOMEM);
                return;
        }
@@ -979,6 +1167,7 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
        item->lri_la_dinode = la_dinode;
        item->lri_slot = slot_num;
        item->lri_tl_dinode = tl_dinode;
+        item->lri_qrec = qrec;
        spin_lock(&journal->j_lock);
        list_add_tail(&item->lri_list, &journal->j_la_cleanups);
@@ -998,6 +1187,7 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
                ocfs2_queue_recovery_completion(journal,
                                                osb->slot_num,
                                                osb->local_alloc_copy,
+                                                NULL,
                                                NULL);
                ocfs2_schedule_truncate_log_flush(osb, 0);
@@ -1006,11 +1196,26 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
        }
 }
+/*
+ * Hand the quota recovery record stashed on the superblock (osb->quota_rec),
+ * if any, to the recovery completion queue for our own slot, then clear the
+ * pointer on the osb.  Does nothing when no record is pending.
+ */
+void ocfs2_complete_quota_recovery(struct ocfs2_super *osb)
+{
+        struct ocfs2_quota_recovery *pending = osb->quota_rec;
+        if (!pending)
+                return;
+        ocfs2_queue_recovery_completion(osb->journal, osb->slot_num,
+                                        NULL, NULL, pending);
+        osb->quota_rec = NULL;
+}
 static int __ocfs2_recovery_thread(void *arg)
 {
-        int status, node_num;
+        int status, node_num, slot_num;
        struct ocfs2_super *osb = arg;
        struct ocfs2_recovery_map *rm = osb->recovery_map;
+        int *rm_quota = NULL;
+        int rm_quota_used = 0, i;
+        struct ocfs2_quota_recovery *qrec;
        mlog_entry_void();
@@ -1019,6 +1224,11 @@ static int __ocfs2_recovery_thread(void *arg)
                goto bail;
        }
+        rm_quota = kzalloc(osb->max_slots * sizeof(int), GFP_NOFS);
+        if (!rm_quota) {
+                status = -ENOMEM;
+                goto bail;
+        }
 restart:
        status = ocfs2_super_lock(osb, 1);
        if (status < 0) {
@@ -1032,8 +1242,28 @@ restart:
                 * clear it until ocfs2_recover_node() has succeeded. */
                node_num = rm->rm_entries[0];
                spin_unlock(&osb->osb_lock);
+                mlog(0, "checking node %d\n", node_num);
-                status = ocfs2_recover_node(osb, node_num);
+                slot_num = ocfs2_node_num_to_slot(osb, node_num);
+                if (slot_num == -ENOENT) {
+                        /* A node without a slot has nothing to replay;
+                         * status 0 lets skip_recovery drop it from the
+                         * recovery map. */
+                        status = 0;
+                        mlog(0, "no slot for this node, so no recovery "
+                             "required.\n");
+                        goto skip_recovery;
+                }
+                mlog(0, "node %d was using slot %d\n", node_num, slot_num);
+                /* It is a bit subtle with quota recovery. We cannot do it
+                 * immediately because we have to obtain cluster locks from
+                 * quota files and we also don't want to just skip it because
+                 * then quota usage would be out of sync until some node takes
+                 * the slot. So we remember which nodes need quota recovery
+                 * and when everything else is done, we recover quotas. */
+                /* Record each slot at most once: scan for an existing entry
+                 * and append only if the scan ran off the end. */
+                for (i = 0; i < rm_quota_used && rm_quota[i] != slot_num; i++)
+                        ;
+                if (i == rm_quota_used)
+                        rm_quota[rm_quota_used++] = slot_num;
+                status = ocfs2_recover_node(osb, node_num, slot_num);
+skip_recovery:
                if (!status) {
                        ocfs2_recovery_map_clear(osb, node_num);
                } else {
@@ -1055,13 +1285,27 @@ restart:
        if (status < 0)
                mlog_errno(status);
+        /* Now is the right time to recover quotas... We have to do this
+         * under the superblock lock so that no one can start using the slot
+         * (and crash) before we recover it. */
+        for (i = 0; i < rm_quota_used; i++) {
+                qrec = ocfs2_begin_quota_recovery(osb, rm_quota[i]);
+                if (IS_ERR(qrec)) {
+                        status = PTR_ERR(qrec);
+                        mlog_errno(status);
+                        continue;
+                }
+                ocfs2_queue_recovery_completion(osb->journal, rm_quota[i],
+                                                NULL, NULL, qrec);
+        }
        ocfs2_super_unlock(osb, 1);
        /* We always run recovery on our own orphan dir - the dead
         * node(s) may have disallowd a previos inode delete. Re-processing
         * is therefore required. */
        ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
-                                        NULL);
+                                        NULL, NULL);
 bail:
        mutex_lock(&osb->recovery_lock);
@@ -1076,6 +1320,9 @@ bail:
        mutex_unlock(&osb->recovery_lock);
+        /* rm_quota is still NULL if we bailed before allocating it;
+         * kfree() ignores NULL, so no guard is needed. */
+        kfree(rm_quota);
        mlog_exit(status);
        /* no one is callint kthread_stop() for us so the kthread() api
         * requires that we call do_exit().  And it isn't exported, but
@@ -1135,8 +1382,7 @@ static int ocfs2_read_journal_inode(struct ocfs2_super *osb,
        }
        SET_INODE_JOURNAL(inode);
-        status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, bh,
+        status = ocfs2_read_inode_block_full(inode, bh, OCFS2_BH_IGNORE_CACHE);
-                                   OCFS2_BH_IGNORE_CACHE);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
@@ -1268,6 +1514,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
        osb->slot_recovery_generations[slot_num] =
                                        ocfs2_get_recovery_generation(fe);
+        ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check);
        status = ocfs2_write_block(osb, bh, inode);
        if (status < 0)
                mlog_errno(status);
@@ -1304,31 +1551,19 @@ done:
 * far less concerning.
 */
 static int ocfs2_recover_node(struct ocfs2_super *osb,
-                              int node_num)
+                              int node_num, int slot_num)
 {
        int status = 0;
-        int slot_num;
        struct ocfs2_dinode *la_copy = NULL;
        struct ocfs2_dinode *tl_copy = NULL;
-        mlog_entry("(node_num=%d, osb->node_num = %d)\n",
+        mlog_entry("(node_num=%d, slot_num=%d, osb->node_num = %d)\n",
-                   node_num, osb->node_num);
+                   node_num, slot_num, osb->node_num);
-        mlog(0, "checking node %d\n", node_num);
        /* Should not ever be called to recover ourselves -- in that
         * case we should've called ocfs2_journal_load instead. */
        BUG_ON(osb->node_num == node_num);
-        slot_num = ocfs2_node_num_to_slot(osb, node_num);
-        if (slot_num == -ENOENT) {
-                status = 0;
-                mlog(0, "no slot for this node, so no recovery required.\n");
-                goto done;
-        }
-        mlog(0, "node %d was using slot %d\n", node_num, slot_num);
        status = ocfs2_replay_journal(osb, node_num, slot_num);
        if (status < 0) {
                if (status == -EBUSY) {
@@ -1364,7 +1599,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
        /* This will kfree the memory pointed to by la_copy and tl_copy */
        ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy,
-                                        tl_copy);
+                                        tl_copy, NULL);
        status = 0;
 done:
@@ -1659,13 +1894,14 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
        return ret;
 }
-static int ocfs2_wait_on_mount(struct ocfs2_super *osb)
+static int __ocfs2_wait_on_mount(struct ocfs2_super *osb, int quota)
 {
        /* This check is good because ocfs2 will wait on our recovery
         * thread before changing it to something other than MOUNTED
         * or DISABLED. */
        wait_event(osb->osb_mount_event,
-                   atomic_read(&osb->vol_state) == VOLUME_MOUNTED ||
+                  (!quota && atomic_read(&osb->vol_state) == VOLUME_MOUNTED) ||
+                   atomic_read(&osb->vol_state) == VOLUME_MOUNTED_QUOTAS ||
                   atomic_read(&osb->vol_state) == VOLUME_DISABLED);
        /* If there's an error on mount, then we may never get to the
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index d4d14e9a3cea..3c3532e1307c 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -27,12 +27,7 @@
 #define OCFS2_JOURNAL_H
 #include <linux/fs.h>
-#ifndef CONFIG_OCFS2_COMPAT_JBD
+#include <linux/jbd2.h>
-# include <linux/jbd2.h>
-#else
-# include <linux/jbd.h>
-# include "ocfs2_jbd_compat.h"
-#endif
 enum ocfs2_journal_state {
        OCFS2_JOURNAL_FREE = 0,
@@ -173,6 +168,7 @@ void   ocfs2_recovery_thread(struct ocfs2_super *osb,
                             int node_num);
 int    ocfs2_mark_dead_nodes(struct ocfs2_super *osb);
 void   ocfs2_complete_mount_recovery(struct ocfs2_super *osb);
+void ocfs2_complete_quota_recovery(struct ocfs2_super *osb);
 static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb)
 {
@@ -216,9 +212,12 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode)
 *  ocfs2_extend_trans     - Extend a handle by nblocks credits. This may
 *                          commit the handle to disk in the process, but will
 *                          not release any locks taken during the transaction.
- *  ocfs2_journal_access   - Notify the handle that we want to journal this
+ *  ocfs2_journal_access* - Notify the handle that we want to journal this
 *                          buffer. Will have to call ocfs2_journal_dirty once
 *                          we've actually dirtied it. Type is one of . or .
+ *                          Always call the specific flavor of
+ *                          ocfs2_journal_access_*() unless you intend to
+ *                          manage the checksum by hand.
 *  ocfs2_journal_dirty    - Mark a journalled buffer as having dirty data.
 *  ocfs2_jbd2_file_inode  - Mark an inode so that its data goes out before
 *                           the current handle commits.
@@ -248,10 +247,29 @@ int			     ocfs2_extend_trans(handle_t *handle, int nblocks);
 #define OCFS2_JOURNAL_ACCESS_WRITE  1
 #define OCFS2_JOURNAL_ACCESS_UNDO   2
-int                  ocfs2_journal_access(handle_t *handle,
-                                          struct inode *inode,
+/* ocfs2_inode */
-                                          struct buffer_head *bh,
+int ocfs2_journal_access_di(handle_t *handle, struct inode *inode,
-                                          int type);
+                            struct buffer_head *bh, int type);
+/* ocfs2_extent_block */
+int ocfs2_journal_access_eb(handle_t *handle, struct inode *inode,
+                            struct buffer_head *bh, int type);
+/* ocfs2_group_desc */
+int ocfs2_journal_access_gd(handle_t *handle, struct inode *inode,
+                            struct buffer_head *bh, int type);
+/* ocfs2_xattr_block */
+int ocfs2_journal_access_xb(handle_t *handle, struct inode *inode,
+                            struct buffer_head *bh, int type);
+/* quota blocks */
+int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode,
+                            struct buffer_head *bh, int type);
+/* dirblock */
+int ocfs2_journal_access_db(handle_t *handle, struct inode *inode,
+                            struct buffer_head *bh, int type);
+/* Anything that has no ecc */
+int ocfs2_journal_access(handle_t *handle, struct inode *inode,
+                         struct buffer_head *bh, int type);
 /*
 * A word about the journal_access/journal_dirty "dance". It is
 * entirely legal to journal_access a buffer more than once (as long
@@ -273,10 +291,6 @@ int                  ocfs2_journal_access(handle_t *handle,
 */
 int                  ocfs2_journal_dirty(handle_t *handle,
                                         struct buffer_head *bh);
-#ifdef CONFIG_OCFS2_COMPAT_JBD
-int                  ocfs2_journal_dirty_data(handle_t *handle,
-                                              struct buffer_head *bh);
-#endif
 /*
 *  Credit Macros:
@@ -293,6 +307,37 @@ int                  ocfs2_journal_dirty_data(handle_t *handle,
 /* extended attribute block update */
 #define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1
+/* global quotafile inode update, data block */
+#define OCFS2_QINFO_WRITE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
+/*
+ * The two writes below can accidentally see global info dirty due
+ * to set_info() quotactl so make them prepared for the writes.
+ */
+/* quota data block, global info */
+/* Write to local quota file */
+#define OCFS2_QWRITE_CREDITS (OCFS2_QINFO_WRITE_CREDITS + 1)
+/* global quota data block, local quota data block, global quota inode,
+ * global quota info */
+#define OCFS2_QSYNC_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 3)
+/*
+ * Journal credits a transaction must add for quota updates: one
+ * OCFS2_QWRITE_CREDITS for each of the user and group quota RO-compat
+ * features set on @sb, so the result is 0, 1x or 2x OCFS2_QWRITE_CREDITS.
+ */
+static inline int ocfs2_quota_trans_credits(struct super_block *sb)
+{
+        int nr_quota_types = 0;
+        if (OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA))
+                nr_quota_types++;
+        if (OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA))
+                nr_quota_types++;
+        return nr_quota_types * OCFS2_QWRITE_CREDITS;
+}
+/* Number of credits needed for removing quota structure from file */
+int ocfs2_calc_qdel_credits(struct super_block *sb, int type);
+/* Number of credits needed for initialization of new quota structure */
+int ocfs2_calc_qinit_credits(struct super_block *sb, int type);
 /* group extend. inode update and last group update. */
 #define OCFS2_GROUP_EXTEND_CREDITS      (OCFS2_INODE_UPDATE_CREDITS + 1)
@@ -303,8 +348,11 @@ int                  ocfs2_journal_dirty_data(handle_t *handle,
 * prev. group desc. if we relink. */
 #define OCFS2_SUBALLOC_ALLOC (3)
-#define OCFS2_INLINE_TO_EXTENTS_CREDITS (OCFS2_SUBALLOC_ALLOC           \
+static inline int ocfs2_inline_to_extents_credits(struct super_block *sb)
-                                         + OCFS2_INODE_UPDATE_CREDITS)
+{
+        return OCFS2_SUBALLOC_ALLOC + OCFS2_INODE_UPDATE_CREDITS +
+               ocfs2_quota_trans_credits(sb);
+}
 /* dinode + group descriptor update. We don't relink on free yet. */
 #define OCFS2_SUBALLOC_FREE  (2)
@@ -313,16 +361,23 @@ int                  ocfs2_journal_dirty_data(handle_t *handle,
 #define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE                 \
                                         + OCFS2_TRUNCATE_LOG_UPDATE)
-#define OCFS2_REMOVE_EXTENT_CREDITS (OCFS2_TRUNCATE_LOG_UPDATE + OCFS2_INODE_UPDATE_CREDITS)
+/*
+ * Credits for removing an extent: a truncate log update plus an inode
+ * update, plus whatever ocfs2_quota_trans_credits() reports for @sb.
+ */
+static inline int ocfs2_remove_extent_credits(struct super_block *sb)
+{
+        int credits = OCFS2_TRUNCATE_LOG_UPDATE + OCFS2_INODE_UPDATE_CREDITS;
+        credits += ocfs2_quota_trans_credits(sb);
+        return credits;
+}
 /* data block for new dir/symlink, 2 for bitmap updates (bitmap fe +
 * bitmap block for the new bit) */
 #define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2)
 /* parent fe, parent block, new file entry, inode alloc fe, inode alloc
- * group descriptor + mkdir/symlink blocks */
+ * group descriptor + mkdir/symlink blocks + quota update */
-#define OCFS2_MKNOD_CREDITS (3 + OCFS2_SUBALLOC_ALLOC                         \
+static inline int ocfs2_mknod_credits(struct super_block *sb)
-                            + OCFS2_DIR_LINK_ADDITIONAL_CREDITS)
+{
+        return 3 + OCFS2_SUBALLOC_ALLOC + OCFS2_DIR_LINK_ADDITIONAL_CREDITS +
+               ocfs2_quota_trans_credits(sb);
+}
 /* local alloc metadata change + main bitmap updates */
 #define OCFS2_WINDOW_MOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS                 \
@@ -332,13 +387,21 @@ int                  ocfs2_journal_dirty_data(handle_t *handle,
 * for the dinode, one for the new block. */
 #define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2)
-/* file update (nlink, etc) + directory mtime/ctime + dir entry block */
+/* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota
-#define OCFS2_LINK_CREDITS  (2*OCFS2_INODE_UPDATE_CREDITS + 1)
+ * update on dir */
+static inline int ocfs2_link_credits(struct super_block *sb)
+{
+        return 2*OCFS2_INODE_UPDATE_CREDITS + 1 +
+               ocfs2_quota_trans_credits(sb);
+}
 /* inode + dir inode (if we unlink a dir), + dir entry block + orphan
 * dir inode link */
-#define OCFS2_UNLINK_CREDITS  (2 * OCFS2_INODE_UPDATE_CREDITS + 1             \
+static inline int ocfs2_unlink_credits(struct super_block *sb)
-                              + OCFS2_LINK_CREDITS)
+{
+        /* The quota update from ocfs2_link_credits is unused here... */
+        return 2 * OCFS2_INODE_UPDATE_CREDITS + 1 + ocfs2_link_credits(sb);
+}
 /* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry +
 * inode alloc group descriptor */
@@ -347,8 +410,10 @@ int                  ocfs2_journal_dirty_data(handle_t *handle,
 /* dinode update, old dir dinode update, new dir dinode update, old
 * dir dir entry, new dir dir entry, dir entry update for renaming
 * directory + target unlink */
-#define OCFS2_RENAME_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3              \
+static inline int ocfs2_rename_credits(struct super_block *sb)
-                             + OCFS2_UNLINK_CREDITS)
+{
+        return 3 * OCFS2_INODE_UPDATE_CREDITS + 3 + ocfs2_unlink_credits(sb);
+}
 /* global bitmap dinode, group desc., relinked group,
 * suballocator dinode, group desc., relinked group,
@@ -386,18 +451,19 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb,
         * credit for the dinode there. */
        extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth);
-        return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks;
+        return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks +
+               ocfs2_quota_trans_credits(sb);
 }
 static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
 {
-        int blocks = OCFS2_MKNOD_CREDITS;
+        int blocks = ocfs2_mknod_credits(sb);
        /* links can be longer than one block so we may update many
         * within our single allocated extent. */
        blocks += ocfs2_clusters_to_blocks(sb, 1);
-        return blocks;
+        return blocks + ocfs2_quota_trans_credits(sb);
 }
 static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb,
@@ -434,6 +500,8 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
        /* update to the truncate log. */
        credits += OCFS2_TRUNCATE_LOG_UPDATE;
+        credits += ocfs2_quota_trans_credits(sb);
        return credits;
 }
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 687b28713c32..ec70cdbe77fc 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -36,6 +36,7 @@
 #include "ocfs2.h"
 #include "alloc.h"
+#include "blockcheck.h"
 #include "dlmglue.h"
 #include "inode.h"
 #include "journal.h"
@@ -248,8 +249,8 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
                goto bail;
        }
-        status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1,
+        status = ocfs2_read_inode_block_full(inode, &alloc_bh,
-                                   &alloc_bh, OCFS2_BH_IGNORE_CACHE);
+                                             OCFS2_BH_IGNORE_CACHE);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
@@ -382,8 +383,8 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
        }
        memcpy(alloc_copy, alloc, bh->b_size);
-        status = ocfs2_journal_access(handle, local_alloc_inode, bh,
+        status = ocfs2_journal_access_di(handle, local_alloc_inode, bh,
-                                      OCFS2_JOURNAL_ACCESS_WRITE);
+                                         OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto out_commit;
@@ -459,8 +460,8 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
        mutex_lock(&inode->i_mutex);
-        status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1,
+        status = ocfs2_read_inode_block_full(inode, &alloc_bh,
-                                   &alloc_bh, OCFS2_BH_IGNORE_CACHE);
+                                             OCFS2_BH_IGNORE_CACHE);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
@@ -476,6 +477,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
        alloc = (struct ocfs2_dinode *) alloc_bh->b_data;
        ocfs2_clear_local_alloc(alloc);
+        ocfs2_compute_meta_ecc(osb->sb, alloc_bh->b_data, &alloc->i_check);
        status = ocfs2_write_block(osb, alloc_bh, inode);
        if (status < 0)
                mlog_errno(status);
@@ -762,9 +764,9 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
         * delete bits from it! */
        *num_bits = bits_wanted;
-        status = ocfs2_journal_access(handle, local_alloc_inode,
+        status = ocfs2_journal_access_di(handle, local_alloc_inode,
-                                      osb->local_alloc_bh,
+                                         osb->local_alloc_bh,
-                                      OCFS2_JOURNAL_ACCESS_WRITE);
+                                         OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
@@ -1240,9 +1242,9 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
        }
        memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size);
-        status = ocfs2_journal_access(handle, local_alloc_inode,
+        status = ocfs2_journal_access_di(handle, local_alloc_inode,
-                                      osb->local_alloc_bh,
+                                         osb->local_alloc_bh,
-                                      OCFS2_JOURNAL_ACCESS_WRITE);
+                                         OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 2545e7402efe..084aba86c3b2 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -40,6 +40,7 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
+#include <linux/quotaops.h>
 #define MLOG_MASK_PREFIX ML_NAMEI
 #include <cluster/masklog.h>
@@ -61,17 +62,18 @@
 #include "sysfile.h"
 #include "uptodate.h"
 #include "xattr.h"
+#include "acl.h"
 #include "buffer_head_io.h"
 static int ocfs2_mknod_locked(struct ocfs2_super *osb,
                              struct inode *dir,
-                              struct dentry *dentry, int mode,
+                              struct inode *inode,
+                              struct dentry *dentry,
                              dev_t dev,
                              struct buffer_head **new_fe_bh,
                              struct buffer_head *parent_fe_bh,
                              handle_t *handle,
-                              struct inode **ret_inode,
                              struct ocfs2_alloc_context *inode_ac);
 static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
@@ -186,6 +188,35 @@ bail:
        return ret;
 }
+/*
+ * Allocate a new in-memory inode on @dir's superblock and prefill its link
+ * count, owner, group and mode from @mode, @dir and the current task's
+ * fsuid/fsgid, then call vfs_dq_init() on it.
+ *
+ * Returns the new inode, or NULL (after logging an error) when new_inode()
+ * fails.
+ */
+static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode)
+{
+        struct inode *fresh = new_inode(dir->i_sb);
+        if (fresh == NULL) {
+                mlog(ML_ERROR, "new_inode failed!\n");
+                return NULL;
+        }
+        /* Fill these in up front: helpers in this file and our callers
+         * read them from the inode. */
+        fresh->i_nlink = S_ISDIR(mode) ? 2 : 1;
+        fresh->i_uid = current_fsuid();
+        if (!(dir->i_mode & S_ISGID)) {
+                fresh->i_gid = current_fsgid();
+        } else {
+                /* setgid parent: take its group, and a new directory
+                 * also gets the setgid bit. */
+                fresh->i_gid = dir->i_gid;
+                if (S_ISDIR(mode))
+                        mode |= S_ISGID;
+        }
+        fresh->i_mode = mode;
+        vfs_dq_init(fresh);
+        return fresh;
+}
 static int ocfs2_mknod(struct inode *dir,
                       struct dentry *dentry,
                       int mode,
@@ -201,6 +232,13 @@ static int ocfs2_mknod(struct inode *dir,
        struct inode *inode = NULL;
        struct ocfs2_alloc_context *inode_ac = NULL;
        struct ocfs2_alloc_context *data_ac = NULL;
+        struct ocfs2_alloc_context *xattr_ac = NULL;
+        int want_clusters = 0;
+        int xattr_credits = 0;
+        struct ocfs2_security_xattr_info si = {
+                .enable = 1,
+        };
+        int did_quota_inode = 0;
        mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
                   (unsigned long)dev, dentry->d_name.len,
@@ -250,17 +288,46 @@ static int ocfs2_mknod(struct inode *dir,
                goto leave;
        }
-        /* Reserve a cluster if creating an extent based directory. */
+        inode = ocfs2_get_init_inode(dir, mode);
-        if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb)) {
+        if (!inode) {
-                status = ocfs2_reserve_clusters(osb, 1, &data_ac);
+                status = -ENOMEM;
-                if (status < 0) {
+                mlog_errno(status);
-                        if (status != -ENOSPC)
+                goto leave;
-                                mlog_errno(status);
+        }
+        /* get security xattr */
+        status = ocfs2_init_security_get(inode, dir, &si);
+        if (status) {
+                if (status == -EOPNOTSUPP)
+                        si.enable = 0;
+                else {
+                        mlog_errno(status);
                        goto leave;
                }
        }
-        handle = ocfs2_start_trans(osb, OCFS2_MKNOD_CREDITS);
+        /* calculate meta data/clusters for setting security and acl xattr */
+        status = ocfs2_calc_xattr_init(dir, parent_fe_bh, mode,
+                                        &si, &want_clusters,
+                                        &xattr_credits, &xattr_ac);
+        if (status < 0) {
+                mlog_errno(status);
+                goto leave;
+        }
+        /* Reserve a cluster if creating an extent based directory. */
+        if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb))
+                want_clusters += 1;
+        status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac);
+        if (status < 0) {
+                if (status != -ENOSPC)
+                        mlog_errno(status);
+                goto leave;
+        }
+        handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb) +
+                                   xattr_credits);
        if (IS_ERR(handle)) {
                status = PTR_ERR(handle);
                handle = NULL;
@@ -268,10 +335,19 @@ static int ocfs2_mknod(struct inode *dir,
                goto leave;
        }
+        /* We don't use standard VFS wrapper because we don't want vfs_dq_init
+         * to be called. */
+        if (sb_any_quota_active(osb->sb) &&
+            osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
+                status = -EDQUOT;
+                goto leave;
+        }
+        did_quota_inode = 1;
        /* do the real work now. */
-        status = ocfs2_mknod_locked(osb, dir, dentry, mode, dev,
+        status = ocfs2_mknod_locked(osb, dir, inode, dentry, dev,
                                    &new_fe_bh, parent_fe_bh, handle,
-                                    &inode, inode_ac);
+                                    inode_ac);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
@@ -285,8 +361,8 @@ static int ocfs2_mknod(struct inode *dir,
                        goto leave;
                }
-                status = ocfs2_journal_access(handle, dir, parent_fe_bh,
+                status = ocfs2_journal_access_di(handle, dir, parent_fe_bh,
-                                              OCFS2_JOURNAL_ACCESS_WRITE);
+                                                 OCFS2_JOURNAL_ACCESS_WRITE);
                if (status < 0) {
                        mlog_errno(status);
                        goto leave;
@@ -300,6 +376,22 @@ static int ocfs2_mknod(struct inode *dir,
                inc_nlink(dir);
        }
+        status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh,
+                                xattr_ac, data_ac);
+        if (status < 0) {
+                mlog_errno(status);
+                goto leave;
+        }
+        if (si.enable) {
+                status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si,
+                                                 xattr_ac, data_ac);
+                if (status < 0) {
+                        mlog_errno(status);
+                        goto leave;
+                }
+        }
        status = ocfs2_add_entry(handle, dentry, inode,
                                 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
                                 de_bh);
@@ -320,6 +412,8 @@ static int ocfs2_mknod(struct inode *dir,
        d_instantiate(dentry, inode);
        status = 0;
 leave:
+        if (status < 0 && did_quota_inode)
+                vfs_dq_free_inode(inode);
        if (handle)
                ocfs2_commit_trans(osb, handle);
@@ -331,9 +425,13 @@ leave:
        brelse(new_fe_bh);
        brelse(de_bh);
        brelse(parent_fe_bh);
+        kfree(si.name);
+        kfree(si.value);
-        if ((status < 0) && inode)
+        if ((status < 0) && inode) {
+                clear_nlink(inode);
                iput(inode);
+        }
        if (inode_ac)
                ocfs2_free_alloc_context(inode_ac);
@@ -341,6 +439,9 @@ leave:
        if (data_ac)
                ocfs2_free_alloc_context(data_ac);
+        if (xattr_ac)
+                ocfs2_free_alloc_context(xattr_ac);
        mlog_exit(status);
        return status;
@@ -348,12 +449,12 @@ leave:
 static int ocfs2_mknod_locked(struct ocfs2_super *osb,
                              struct inode *dir,
-                              struct dentry *dentry, int mode,
+                              struct inode *inode,
+                              struct dentry *dentry,
                              dev_t dev,
                              struct buffer_head **new_fe_bh,
                              struct buffer_head *parent_fe_bh,
                              handle_t *handle,
-                              struct inode **ret_inode,
                              struct ocfs2_alloc_context *inode_ac)
 {
        int status = 0;
@@ -361,14 +462,12 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
        struct ocfs2_extent_list *fel;
        u64 fe_blkno = 0;
        u16 suballoc_bit;
-        struct inode *inode = NULL;
-        mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
+        mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry,
-                   (unsigned long)dev, dentry->d_name.len,
+                   inode->i_mode, (unsigned long)dev, dentry->d_name.len,
                   dentry->d_name.name);
        *new_fe_bh = NULL;
-        *ret_inode = NULL;
        status = ocfs2_claim_new_inode(osb, handle, inode_ac, &suballoc_bit,
                                       &fe_blkno);
@@ -377,23 +476,11 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
                goto leave;
        }
-        inode = new_inode(dir->i_sb);
-        if (!inode) {
-                status = -ENOMEM;
-                mlog(ML_ERROR, "new_inode failed!\n");
-                goto leave;
-        }
        /* populate as many fields early on as possible - many of
         * these are used by the support functions here and in
         * callers. */
        inode->i_ino = ino_from_blkno(osb->sb, fe_blkno);
        OCFS2_I(inode)->ip_blkno = fe_blkno;
-        if (S_ISDIR(mode))
-                inode->i_nlink = 2;
-        else
-                inode->i_nlink = 1;
-        inode->i_mode = mode;
        spin_lock(&osb->osb_lock);
        inode->i_generation = osb->s_next_generation++;
        spin_unlock(&osb->osb_lock);
@@ -406,8 +493,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
        }
        ocfs2_set_new_buffer_uptodate(inode, *new_fe_bh);
-        status = ocfs2_journal_access(handle, inode, *new_fe_bh,
+        status = ocfs2_journal_access_di(handle, inode, *new_fe_bh,
-                                      OCFS2_JOURNAL_ACCESS_CREATE);
+                                         OCFS2_JOURNAL_ACCESS_CREATE);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
@@ -421,17 +508,11 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
        fe->i_blkno = cpu_to_le64(fe_blkno);
        fe->i_suballoc_bit = cpu_to_le16(suballoc_bit);
        fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot);
-        fe->i_uid = cpu_to_le32(current_fsuid());
+        fe->i_uid = cpu_to_le32(inode->i_uid);
-        if (dir->i_mode & S_ISGID) {
+        fe->i_gid = cpu_to_le32(inode->i_gid);
-                fe->i_gid = cpu_to_le32(dir->i_gid);
+        fe->i_mode = cpu_to_le16(inode->i_mode);
-                if (S_ISDIR(mode))
+        if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
-                        mode |= S_ISGID;
-        } else
-                fe->i_gid = cpu_to_le32(current_fsgid());
-        fe->i_mode = cpu_to_le16(mode);
-        if (S_ISCHR(mode) || S_ISBLK(mode))
                fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev));
        fe->i_links_count = cpu_to_le16(inode->i_nlink);
        fe->i_last_eb_blk = 0;
@@ -446,7 +527,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
        /*
         * If supported, directories start with inline data.
         */
-        if (S_ISDIR(mode) && ocfs2_supports_inline_data(osb)) {
+        if (S_ISDIR(inode->i_mode) && ocfs2_supports_inline_data(osb)) {
                u16 feat = le16_to_cpu(fe->i_dyn_features);
                fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL);
@@ -465,15 +546,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
                goto leave;
        }
-        if (ocfs2_populate_inode(inode, fe, 1) < 0) {
+        ocfs2_populate_inode(inode, fe, 1);
-                mlog(ML_ERROR, "populate inode failed! bh->b_blocknr=%llu, "
-                     "i_blkno=%llu, i_ino=%lu\n",
-                     (unsigned long long)(*new_fe_bh)->b_blocknr,
-                     (unsigned long long)le64_to_cpu(fe->i_blkno),
-                     inode->i_ino);
-                BUG();
-        }
        ocfs2_inode_set_new(osb, inode);
        if (!ocfs2_mount_local(osb)) {
                status = ocfs2_create_new_inode_locks(inode);
@@ -484,17 +557,12 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
        status = 0; /* error in ocfs2_create_new_inode_locks is not
                     * critical */
-        *ret_inode = inode;
 leave:
        if (status < 0) {
                if (*new_fe_bh) {
                        brelse(*new_fe_bh);
                        *new_fe_bh = NULL;
                }
-                if (inode) {
-                        clear_nlink(inode);
-                        iput(inode);
-                }
        }
        mlog_exit(status);
@@ -588,7 +656,7 @@ static int ocfs2_link(struct dentry *old_dentry,
                goto out_unlock_inode;
        }
-        handle = ocfs2_start_trans(osb, OCFS2_LINK_CREDITS);
+        handle = ocfs2_start_trans(osb, ocfs2_link_credits(osb->sb));
        if (IS_ERR(handle)) {
                err = PTR_ERR(handle);
                handle = NULL;
@@ -596,8 +664,8 @@ static int ocfs2_link(struct dentry *old_dentry,
                goto out_unlock_inode;
        }
-        err = ocfs2_journal_access(handle, inode, fe_bh,
+        err = ocfs2_journal_access_di(handle, inode, fe_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                                      OCFS2_JOURNAL_ACCESS_WRITE);
        if (err < 0) {
                mlog_errno(err);
                goto out_commit;
@@ -775,7 +843,7 @@ static int ocfs2_unlink(struct inode *dir,
                }
        }
-        handle = ocfs2_start_trans(osb, OCFS2_UNLINK_CREDITS);
+        handle = ocfs2_start_trans(osb, ocfs2_unlink_credits(osb->sb));
        if (IS_ERR(handle)) {
                status = PTR_ERR(handle);
                handle = NULL;
@@ -783,8 +851,8 @@ static int ocfs2_unlink(struct inode *dir,
                goto leave;
        }
-        status = ocfs2_journal_access(handle, inode, fe_bh,
+        status = ocfs2_journal_access_di(handle, inode, fe_bh,
-                                      OCFS2_JOURNAL_ACCESS_WRITE);
+                                         OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
@@ -1181,7 +1249,7 @@ static int ocfs2_rename(struct inode *old_dir,
                }
        }
-        handle = ocfs2_start_trans(osb, OCFS2_RENAME_CREDITS);
+        handle = ocfs2_start_trans(osb, ocfs2_rename_credits(osb->sb));
        if (IS_ERR(handle)) {
                status = PTR_ERR(handle);
                handle = NULL;
@@ -1197,8 +1265,8 @@ static int ocfs2_rename(struct inode *old_dir,
                                goto bail;
                        }
                }
-                status = ocfs2_journal_access(handle, new_inode, newfe_bh,
+                status = ocfs2_journal_access_di(handle, new_inode, newfe_bh,
-                                              OCFS2_JOURNAL_ACCESS_WRITE);
+                                                 OCFS2_JOURNAL_ACCESS_WRITE);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail;
@@ -1244,8 +1312,8 @@ static int ocfs2_rename(struct inode *old_dir,
        old_inode->i_ctime = CURRENT_TIME;
        mark_inode_dirty(old_inode);
-        status = ocfs2_journal_access(handle, old_inode, old_inode_bh,
+        status = ocfs2_journal_access_di(handle, old_inode, old_inode_bh,
-                                      OCFS2_JOURNAL_ACCESS_WRITE);
+                                         OCFS2_JOURNAL_ACCESS_WRITE);
        if (status >= 0) {
                old_di = (struct ocfs2_dinode *) old_inode_bh->b_data;
@@ -1321,9 +1389,9 @@ static int ocfs2_rename(struct inode *old_dir,
                             (int)old_dir_nlink, old_dir->i_nlink);
                } else {
                        struct ocfs2_dinode *fe;
-                        status = ocfs2_journal_access(handle, old_dir,
+                        status = ocfs2_journal_access_di(handle, old_dir,
-                                                      old_dir_bh,
+                                                         old_dir_bh,
-                                                      OCFS2_JOURNAL_ACCESS_WRITE);
+                                                         OCFS2_JOURNAL_ACCESS_WRITE);
                        fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
                        fe->i_links_count = cpu_to_le16(old_dir->i_nlink);
                        status = ocfs2_journal_dirty(handle, old_dir_bh);
@@ -1496,6 +1564,13 @@ static int ocfs2_symlink(struct inode *dir,
        handle_t *handle = NULL;
        struct ocfs2_alloc_context *inode_ac = NULL;
        struct ocfs2_alloc_context *data_ac = NULL;
+        struct ocfs2_alloc_context *xattr_ac = NULL;
+        int want_clusters = 0;
+        int xattr_credits = 0;
+        struct ocfs2_security_xattr_info si = {
+                .enable = 1,
+        };
+        int did_quota = 0, did_quota_inode = 0;
        mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
                   dentry, symname, dentry->d_name.len, dentry->d_name.name);
@@ -1542,17 +1617,46 @@ static int ocfs2_symlink(struct inode *dir,
                goto bail;
        }
-        /* don't reserve bitmap space for fast symlinks. */
+        inode = ocfs2_get_init_inode(dir, S_IFLNK | S_IRWXUGO);
-        if (l > ocfs2_fast_symlink_chars(sb)) {
+        if (!inode) {
-                status = ocfs2_reserve_clusters(osb, 1, &data_ac);
+                status = -ENOMEM;
+                mlog_errno(status);
+                goto bail;
+        }
+        /* get security xattr */
+        status = ocfs2_init_security_get(inode, dir, &si);
+        if (status) {
+                if (status == -EOPNOTSUPP)
+                        si.enable = 0;
+                else {
+                        mlog_errno(status);
+                        goto bail;
+                }
+        }
+        /* calculate meta data/clusters for setting security xattr */
+        if (si.enable) {
+                status = ocfs2_calc_security_init(dir, &si, &want_clusters,
+                                                  &xattr_credits, &xattr_ac);
                if (status < 0) {
-                        if (status != -ENOSPC)
+                        mlog_errno(status);
-                                mlog_errno(status);
                        goto bail;
                }
        }
-        handle = ocfs2_start_trans(osb, credits);
+        /* don't reserve bitmap space for fast symlinks. */
+        if (l > ocfs2_fast_symlink_chars(sb))
+                want_clusters += 1;
+        status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac);
+        if (status < 0) {
+                if (status != -ENOSPC)
+                        mlog_errno(status);
+                goto bail;
+        }
+        handle = ocfs2_start_trans(osb, credits + xattr_credits);
        if (IS_ERR(handle)) {
                status = PTR_ERR(handle);
                handle = NULL;
@@ -1560,10 +1664,18 @@ static int ocfs2_symlink(struct inode *dir,
                goto bail;
        }
-        status = ocfs2_mknod_locked(osb, dir, dentry,
+        /* We don't use standard VFS wrapper because we don't want vfs_dq_init
-                                    S_IFLNK | S_IRWXUGO, 0,
+         * to be called. */
-                                    &new_fe_bh, parent_fe_bh, handle,
+        if (sb_any_quota_active(osb->sb) &&
-                                    &inode, inode_ac);
+            osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
+                status = -EDQUOT;
+                goto bail;
+        }
+        did_quota_inode = 1;
+        status = ocfs2_mknod_locked(osb, dir, inode, dentry,
+                                    0, &new_fe_bh, parent_fe_bh, handle,
+                                    inode_ac);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
@@ -1576,6 +1688,12 @@ static int ocfs2_symlink(struct inode *dir,
                u32 offset = 0;
                inode->i_op = &ocfs2_symlink_inode_operations;
+                if (vfs_dq_alloc_space_nodirty(inode,
+                    ocfs2_clusters_to_bytes(osb->sb, 1))) {
+                        status = -EDQUOT;
+                        goto bail;
+                }
+                did_quota = 1;
                status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0,
                                              new_fe_bh,
                                              handle, data_ac, NULL,
@@ -1614,6 +1732,15 @@ static int ocfs2_symlink(struct inode *dir,
                }
        }
+        if (si.enable) {
+                status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si,
+                                                 xattr_ac, data_ac);
+                if (status < 0) {
+                        mlog_errno(status);
+                        goto bail;
+                }
+        }
        status = ocfs2_add_entry(handle, dentry, inode,
                                 le64_to_cpu(fe->i_blkno), parent_fe_bh,
                                 de_bh);
@@ -1632,6 +1759,11 @@ static int ocfs2_symlink(struct inode *dir,
        dentry->d_op = &ocfs2_dentry_ops;
        d_instantiate(dentry, inode);
 bail:
+        if (status < 0 && did_quota)
+                vfs_dq_free_space_nodirty(inode,
+                                        ocfs2_clusters_to_bytes(osb->sb, 1));
+        if (status < 0 && did_quota_inode)
+                vfs_dq_free_inode(inode);
        if (handle)
                ocfs2_commit_trans(osb, handle);
@@ -1640,12 +1772,18 @@ bail:
        brelse(new_fe_bh);
        brelse(parent_fe_bh);
        brelse(de_bh);
+        kfree(si.name);
+        kfree(si.value);
        if (inode_ac)
                ocfs2_free_alloc_context(inode_ac);
        if (data_ac)
                ocfs2_free_alloc_context(data_ac);
-        if ((status < 0) && inode)
+        if (xattr_ac)
+                ocfs2_free_alloc_context(xattr_ac);
+        if ((status < 0) && inode) {
+                clear_nlink(inode);
                iput(inode);
+        }
        mlog_exit(status);
@@ -1754,16 +1892,14 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
        mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
-        status = ocfs2_read_block(orphan_dir_inode,
+        status = ocfs2_read_inode_block(orphan_dir_inode, &orphan_dir_bh);
-                                  OCFS2_I(orphan_dir_inode)->ip_blkno,
-                                  &orphan_dir_bh);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
        }
-        status = ocfs2_journal_access(handle, orphan_dir_inode, orphan_dir_bh,
+        status = ocfs2_journal_access_di(handle, orphan_dir_inode, orphan_dir_bh,
-                                      OCFS2_JOURNAL_ACCESS_WRITE);
+                                         OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
@@ -1850,8 +1986,8 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
                goto leave;
        }
-        status = ocfs2_journal_access(handle,orphan_dir_inode,  orphan_dir_bh,
+        status = ocfs2_journal_access_di(handle,orphan_dir_inode,  orphan_dir_bh,
-                                      OCFS2_JOURNAL_ACCESS_WRITE);
+                                         OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 3fed9e3d8992..077384135f4e 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -161,6 +161,7 @@ enum ocfs2_vol_state
 {
        VOLUME_INIT = 0,
        VOLUME_MOUNTED,
+        VOLUME_MOUNTED_QUOTAS,
        VOLUME_DISMOUNTED,
        VOLUME_DISABLED
 };
@@ -195,6 +196,9 @@ enum ocfs2_mount_options
        OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */
        OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */
        OCFS2_MOUNT_INODE64 = 1 << 7,   /* Allow inode numbers > 2^32 */
+        OCFS2_MOUNT_POSIX_ACL = 1 << 8, /* POSIX access control lists */
+        OCFS2_MOUNT_USRQUOTA = 1 << 9, /* We support user quotas */
+        OCFS2_MOUNT_GRPQUOTA = 1 << 10, /* We support group quotas */
 };
 #define OCFS2_OSB_SOFT_RO       0x0001
@@ -205,6 +209,8 @@ enum ocfs2_mount_options
 struct ocfs2_journal;
 struct ocfs2_slot_info;
 struct ocfs2_recovery_map;
+struct ocfs2_quota_recovery;
+struct ocfs2_dentry_lock;
 struct ocfs2_super
 {
        struct task_struct *commit_task;
@@ -286,10 +292,11 @@ struct ocfs2_super
        char *local_alloc_debug_buf;
 #endif
-        /* Next two fields are for local node slot recovery during
+        /* Next three fields are for local node slot recovery during
         * mount. */
        int dirty;
        struct ocfs2_dinode *local_alloc_copy;
+        struct ocfs2_quota_recovery *quota_rec;
        struct ocfs2_alloc_stats alloc_stats;
        char dev_str[20];               /* "major,minor" of the device */
@@ -319,6 +326,11 @@ struct ocfs2_super
        struct list_head blocked_lock_list;
        unsigned long blocked_lock_count;
+        /* List of dentry locks to release. Anyone can add locks to
+         * the list, ocfs2_wq processes the list  */
+        struct ocfs2_dentry_lock *dentry_lock_list;
+        struct work_struct dentry_lock_work;
        wait_queue_head_t               osb_mount_event;
        /* Truncate log info */
@@ -333,6 +345,10 @@ struct ocfs2_super
 #define OCFS2_SB(sb)        ((struct ocfs2_super *)(sb)->s_fs_info)
+/* Useful typedef for passing around journal access functions.
+ * The pointed-to function takes a journal handle, an inode, a buffer
+ * head and an int access type, and returns an int status. */
+typedef int (*ocfs2_journal_access_func)(handle_t *handle, struct inode *inode,
+                                         struct buffer_head *bh, int type);
 static inline int ocfs2_should_order_data(struct inode *inode)
 {
        if (!S_ISREG(inode->i_mode))
@@ -376,6 +392,13 @@ static inline int ocfs2_supports_xattr(struct ocfs2_super *osb)
        return 0;
 }
+/*
+ * Report whether OCFS2_FEATURE_INCOMPAT_META_ECC is set in
+ * osb->s_feature_incompat.
+ *
+ * Returns 1 if the bit is set, 0 otherwise.
+ */
+static inline int ocfs2_meta_ecc(struct ocfs2_super *osb)
+{
+        if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_META_ECC)
+                return 1;
+        return 0;
+}
 /* set / clear functions because cluster events can make these happen
 * in parallel so we want the transitions to be atomic. this also
 * means that any future flags osb_flags must be protected by spinlock
@@ -443,39 +466,19 @@ static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
 #define OCFS2_IS_VALID_DINODE(ptr)                                      \
        (!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE))
-#define OCFS2_RO_ON_INVALID_DINODE(__sb, __di)  do {                    \
-        typeof(__di) ____di = (__di);                                   \
-        ocfs2_error((__sb),                                             \
-                "Dinode # %llu has bad signature %.*s",                 \
-                (unsigned long long)le64_to_cpu((____di)->i_blkno), 7,  \
-                (____di)->i_signature);                                 \
-} while (0)
 #define OCFS2_IS_VALID_EXTENT_BLOCK(ptr)                                \
        (!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE))
-#define OCFS2_RO_ON_INVALID_EXTENT_BLOCK(__sb, __eb)    do {            \
-        typeof(__eb) ____eb = (__eb);                                   \
-        ocfs2_error((__sb),                                             \
-                "Extent Block # %llu has bad signature %.*s",           \
-                (unsigned long long)le64_to_cpu((____eb)->h_blkno), 7,  \
-                (____eb)->h_signature);                                 \
-} while (0)
 #define OCFS2_IS_VALID_GROUP_DESC(ptr)                                  \
        (!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE))
-#define OCFS2_RO_ON_INVALID_GROUP_DESC(__sb, __gd)      do {            \
-        typeof(__gd) ____gd = (__gd);                                   \
-                ocfs2_error((__sb),                                     \
-                "Group Descriptor # %llu has bad signature %.*s",       \
-                (unsigned long long)le64_to_cpu((____gd)->bg_blkno), 7, \
-                (____gd)->bg_signature);                                \
-} while (0)
 #define OCFS2_IS_VALID_XATTR_BLOCK(ptr)                                 \
        (!strcmp((ptr)->xb_signature, OCFS2_XATTR_BLOCK_SIGNATURE))
+/* Non-zero when (ptr)->db_signature compares equal, via strcmp(), to
+ * OCFS2_DIR_TRAILER_SIGNATURE. */
+#define OCFS2_IS_VALID_DIR_TRAILER(ptr)                                 \
+        (!strcmp((ptr)->db_signature, OCFS2_DIR_TRAILER_SIGNATURE))
 static inline unsigned long ino_from_blkno(struct super_block *sb,
                                           u64 blkno)
 {
@@ -632,5 +635,6 @@ static inline s16 ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
 #define ocfs2_clear_bit ext2_clear_bit
 #define ocfs2_test_bit ext2_test_bit
 #define ocfs2_find_next_zero_bit ext2_find_next_zero_bit
+#define ocfs2_find_next_bit ext2_find_next_bit
 #endif  /* OCFS2_H */
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 5e0c0d0aef7d..c7ae45aaa36c 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -65,6 +65,7 @@
 #define OCFS2_EXTENT_BLOCK_SIGNATURE    "EXBLK01"
 #define OCFS2_GROUP_DESC_SIGNATURE      "GROUP01"
 #define OCFS2_XATTR_BLOCK_SIGNATURE     "XATTR01"
+#define OCFS2_DIR_TRAILER_SIGNATURE     "DIRTRL1"
 /* Compatibility flags */
 #define OCFS2_HAS_COMPAT_FEATURE(sb,mask)                       \
@@ -93,8 +94,11 @@
                                         | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \
                                         | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \
                                         | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \
-                                         | OCFS2_FEATURE_INCOMPAT_XATTR)
+                                         | OCFS2_FEATURE_INCOMPAT_XATTR \
-#define OCFS2_FEATURE_RO_COMPAT_SUPP    OCFS2_FEATURE_RO_COMPAT_UNWRITTEN
+                                         | OCFS2_FEATURE_INCOMPAT_META_ECC)
+#define OCFS2_FEATURE_RO_COMPAT_SUPP    (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
+                                         | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
+                                         | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
 /*
 * Heartbeat-only devices are missing journals and other files.  The
@@ -147,6 +151,9 @@
 /* Support for extended attributes */
 #define OCFS2_FEATURE_INCOMPAT_XATTR            0x0200
+/* Metadata checksum and error correction */
+#define OCFS2_FEATURE_INCOMPAT_META_ECC         0x0800
 /*
 * backup superblock flag is used to indicate that this volume
 * has backup superblocks.
@@ -163,6 +170,12 @@
 */
 #define OCFS2_FEATURE_RO_COMPAT_UNWRITTEN       0x0001
+/*
+ * Maintain quota information for this filesystem
+ */
+#define OCFS2_FEATURE_RO_COMPAT_USRQUOTA        0x0002
+#define OCFS2_FEATURE_RO_COMPAT_GRPQUOTA        0x0004
 /* The byte offset of the first backup block will be 1G.
 * The following will be 4G, 16G, 64G, 256G and 1T.
 */
@@ -192,6 +205,7 @@
 #define OCFS2_HEARTBEAT_FL      (0x00000200)    /* Heartbeat area */
 #define OCFS2_CHAIN_FL          (0x00000400)    /* Chain allocator */
 #define OCFS2_DEALLOC_FL        (0x00000800)    /* Truncate log */
+#define OCFS2_QUOTA_FL          (0x00001000)    /* Quota file */
 /*
 * Flags on ocfs2_dinode.i_dyn_features
@@ -329,13 +343,17 @@ enum {
 #define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE
        HEARTBEAT_SYSTEM_INODE,
        GLOBAL_BITMAP_SYSTEM_INODE,
-#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GLOBAL_BITMAP_SYSTEM_INODE
+        USER_QUOTA_SYSTEM_INODE,
+        GROUP_QUOTA_SYSTEM_INODE,
+#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GROUP_QUOTA_SYSTEM_INODE
        ORPHAN_DIR_SYSTEM_INODE,
        EXTENT_ALLOC_SYSTEM_INODE,
        INODE_ALLOC_SYSTEM_INODE,
        JOURNAL_SYSTEM_INODE,
        LOCAL_ALLOC_SYSTEM_INODE,
        TRUNCATE_LOG_SYSTEM_INODE,
+        LOCAL_USER_QUOTA_SYSTEM_INODE,
+        LOCAL_GROUP_QUOTA_SYSTEM_INODE,
        NUM_SYSTEM_INODES
 };
@@ -349,6 +367,8 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
        [SLOT_MAP_SYSTEM_INODE]                 = { "slot_map", 0, S_IFREG | 0644 },
        [HEARTBEAT_SYSTEM_INODE]                = { "heartbeat", OCFS2_HEARTBEAT_FL, S_IFREG | 0644 },
        [GLOBAL_BITMAP_SYSTEM_INODE]            = { "global_bitmap", 0, S_IFREG | 0644 },
+        [USER_QUOTA_SYSTEM_INODE]               = { "aquota.user", OCFS2_QUOTA_FL, S_IFREG | 0644 },
+        [GROUP_QUOTA_SYSTEM_INODE]              = { "aquota.group", OCFS2_QUOTA_FL, S_IFREG | 0644 },
        /* Slot-specific system inodes (one copy per slot) */
        [ORPHAN_DIR_SYSTEM_INODE]               = { "orphan_dir:%04d", 0, S_IFDIR | 0755 },
@@ -356,7 +376,9 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
        [INODE_ALLOC_SYSTEM_INODE]              = { "inode_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 },
        [JOURNAL_SYSTEM_INODE]                  = { "journal:%04d", OCFS2_JOURNAL_FL, S_IFREG | 0644 },
        [LOCAL_ALLOC_SYSTEM_INODE]              = { "local_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_LOCAL_ALLOC_FL, S_IFREG | 0644 },
-        [TRUNCATE_LOG_SYSTEM_INODE]             = { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 }
+        [TRUNCATE_LOG_SYSTEM_INODE]             = { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 },
+        [LOCAL_USER_QUOTA_SYSTEM_INODE]         = { "aquota.user:%04d", OCFS2_QUOTA_FL, S_IFREG | 0644 },
+        [LOCAL_GROUP_QUOTA_SYSTEM_INODE]        = { "aquota.group:%04d", OCFS2_QUOTA_FL, S_IFREG | 0644 },
 };
 /* Parameter passed from mount.ocfs2 to module */
@@ -410,6 +432,22 @@ static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
 #define OCFS2_RAW_SB(dinode)            (&((dinode)->id2.i_super))
 /*
+ * Block checking structure.  This is used in metadata to validate the
+ * contents.  If OCFS2_FEATURE_INCOMPAT_META_ECC is not set, it is all
+ * zeros.
+ */
+struct ocfs2_block_check {
+/*00*/  __le32 bc_crc32e;       /* 802.3 Ethernet II CRC32 */
+        __le16 bc_ecc;          /* Single-error-correction parity vector.
+                                   This is a simple Hamming code dependent
+                                   on the blocksize.  OCFS2's maximum
+                                   blocksize, 4K, requires 16 parity bits,
+                                   so we fit in __le16. */
+        __le16 bc_reserved1;
+/*08*/
+};
+/*
 * On disk extent record for OCFS2
 * It describes a range of clusters on disk.
 *
@@ -496,7 +534,7 @@ struct ocfs2_truncate_log {
 struct ocfs2_extent_block
 {
 /*00*/  __u8 h_signature[8];            /* Signature for verification */
-        __le64 h_reserved1;
+        struct ocfs2_block_check h_check;       /* Error checking */
 /*10*/  __le16 h_suballoc_slot;         /* Slot suballocator this
                                           extent_header belongs to */
        __le16 h_suballoc_bit;          /* Bit offset in suballocator
@@ -666,7 +704,8 @@ struct ocfs2_dinode {
                                           was set in i_flags */
        __le16 i_dyn_features;
        __le64 i_xattr_loc;
-/*80*/  __le64 i_reserved2[7];
+/*80*/  struct ocfs2_block_check i_check;       /* Error checking */
+/*88*/  __le64 i_reserved2[6];
 /*B8*/  union {
                __le64 i_pad1;          /* Generic way to refer to this
                                           64bit union */
@@ -715,6 +754,34 @@ struct ocfs2_dir_entry {
 } __attribute__ ((packed));
 /*
+ * Per-block record for the unindexed directory btree. This is carefully
+ * crafted so that the rec_len and name_len records of an ocfs2_dir_entry are
+ * mirrored. That way, the directory manipulation code needs a minimal amount
+ * of update.
+ *
+ * NOTE: Keep this structure aligned to a multiple of 4 bytes.
+ */
+struct ocfs2_dir_block_trailer {
+/*00*/  __le64          db_compat_inode;        /* Always zero. Was inode */
+        __le16          db_compat_rec_len;      /* Backwards compatible with
+                                                 * ocfs2_dir_entry. */
+        __u8            db_compat_name_len;     /* Always zero. Was name_len */
+        __u8            db_reserved0;
+        __le16          db_reserved1;
+        __le16          db_free_rec_len;        /* Size of largest empty hole
+                                                 * in this block. (unused) */
+/*10*/  __u8            db_signature[8];        /* Signature for verification */
+        __le64          db_reserved2;
+        __le64          db_free_next;           /* Next block in list (unused) */
+/*20*/  __le64          db_blkno;               /* Offset on disk, in blocks */
+        __le64          db_parent_dinode;       /* dinode which owns me, in
+                                                   blocks */
+/*30*/  struct ocfs2_block_check db_check;      /* Error checking */
+/*40*/
+};
+/*
 * On disk allocator group structure for OCFS2
 */
 struct ocfs2_group_desc
@@ -733,7 +800,8 @@ struct ocfs2_group_desc
 /*20*/  __le64   bg_parent_dinode;       /* dinode which owns me, in
                                           blocks */
        __le64   bg_blkno;               /* Offset on disk, in blocks */
-/*30*/  __le64   bg_reserved2[2];
+/*30*/  struct ocfs2_block_check bg_check;      /* Error checking */
+        __le64   bg_reserved2;
 /*40*/  __u8    bg_bitmap[0];
 };
@@ -776,7 +844,12 @@ struct ocfs2_xattr_header {
                                                   in this extent record,
                                                   only valid in the first
                                                   bucket. */
-        __le64  xh_csum;
+        struct ocfs2_block_check xh_check;      /* Error checking
+                                                   (Note, this is only
+                                                    used for xattr
+                                                    buckets.  A block uses
+                                                    xb_check and sets
+                                                    this field to zero.) */
        struct ocfs2_xattr_entry xh_entries[0]; /* xattr entry list. */
 };
@@ -827,7 +900,7 @@ struct ocfs2_xattr_block {
                                        block group */
        __le32  xb_fs_generation;    /* Must match super block */
 /*10*/  __le64  xb_blkno;            /* Offset on disk, in blocks */
-        __le64  xb_csum;
+        struct ocfs2_block_check xb_check;      /* Error checking */
 /*20*/  __le16  xb_flags;            /* Indicates whether this block contains
                                        real xattr or a xattr tree. */
        __le16  xb_reserved0;
@@ -868,6 +941,128 @@ static inline int ocfs2_xattr_get_type(struct ocfs2_xattr_entry *xe)
        return xe->xe_type & OCFS2_XATTR_TYPE_MASK;
 }
+/*
+ *  On disk structures for global quota file
+ */
+/* Magic numbers and known versions for global quota files */
+#define OCFS2_GLOBAL_QMAGICS {\
+        0x0cf52470, /* USRQUOTA */ \
+        0x0cf52471  /* GRPQUOTA */ \
+}
+#define OCFS2_GLOBAL_QVERSIONS {\
+        0, \
+        0, \
+}
+/* Each block of each quota file has a certain fixed number of bytes reserved
+ * for OCFS2 internal use at its end. OCFS2 can use it for things like
+ * checksums, etc. */
+#define OCFS2_QBLK_RESERVED_SPACE 8
+/* Generic header of all quota files */
+struct ocfs2_disk_dqheader {
+        __le32 dqh_magic;       /* Magic number identifying file */
+        __le32 dqh_version;     /* Quota format version */
+};
+#define OCFS2_GLOBAL_INFO_OFF (sizeof(struct ocfs2_disk_dqheader))
+/* Information header of global quota file (immediately follows the generic
+ * header) */
+struct ocfs2_global_disk_dqinfo {
+/*00*/  __le32 dqi_bgrace;      /* Grace time for space softlimit excess */
+        __le32 dqi_igrace;      /* Grace time for inode softlimit excess */
+        __le32 dqi_syncms;      /* Time after which we sync local changes to
+                                 * global quota file */
+        __le32 dqi_blocks;      /* Number of blocks in quota file */
+/*10*/  __le32 dqi_free_blk;    /* First free block in quota file */
+        __le32 dqi_free_entry;  /* First block with free dquot entry in quota
+                                 * file */
+};
+/* Structure with global user / group information. We reserve some space
+ * for future use. */
+struct ocfs2_global_disk_dqblk {
+/*00*/  __le32 dqb_id;          /* ID the structure belongs to */
+        __le32 dqb_use_count;   /* Number of nodes having reference to this structure */
+        __le64 dqb_ihardlimit;  /* absolute limit on allocated inodes */
+/*10*/  __le64 dqb_isoftlimit;  /* preferred inode limit */
+        __le64 dqb_curinodes;   /* current # allocated inodes */
+/*20*/  __le64 dqb_bhardlimit;  /* absolute limit on disk space */
+        __le64 dqb_bsoftlimit;  /* preferred limit on disk space */
+/*30*/  __le64 dqb_curspace;    /* current space occupied */
+        __le64 dqb_btime;       /* time limit for excessive disk use */
+/*40*/  __le64 dqb_itime;       /* time limit for excessive inode use */
+        __le64 dqb_pad1;
+/*50*/  __le64 dqb_pad2;
+};
+/*
+ *  On-disk structures for local quota file
+ */
+/* Magic numbers and known versions for local quota files */
+#define OCFS2_LOCAL_QMAGICS {\
+        0x0cf524c0, /* USRQUOTA */ \
+        0x0cf524c1  /* GRPQUOTA */ \
+}
+#define OCFS2_LOCAL_QVERSIONS {\
+        0, \
+        0, \
+}
+/* Quota flags in dqinfo header */
+#define OLQF_CLEAN      0x0001  /* Quota file is empty (this should be after\
+                                 * quota has been cleanly turned off) */
+#define OCFS2_LOCAL_INFO_OFF (sizeof(struct ocfs2_disk_dqheader))
+/* Information header of local quota file (immediately follows the generic
+ * header) */
+struct ocfs2_local_disk_dqinfo {
+        __le32 dqi_flags;       /* Flags for quota file */
+        __le32 dqi_chunks;      /* Number of chunks of quota structures
+                                 * with a bitmap */
+        __le32 dqi_blocks;      /* Number of blocks allocated for quota file */
+};
+/* Header of one chunk of a quota file */
+struct ocfs2_local_disk_chunk {
+        __le32 dqc_free;        /* Number of free entries in the bitmap */
+        u8 dqc_bitmap[0];       /* Bitmap of entries in the corresponding
+                                 * chunk of quota file */
+};
+/* One entry in local quota file */
+struct ocfs2_local_disk_dqblk {
+/*00*/  __le64 dqb_id;          /* id this quota applies to */
+        __le64 dqb_spacemod;    /* Change in the amount of used space */
+/*10*/  __le64 dqb_inodemod;    /* Change in the amount of used inodes */
+};
+/*
+ * The quota trailer lives at the end of each quota block.
+ */
+struct ocfs2_disk_dqtrailer {
+/*00*/  struct ocfs2_block_check dq_check;      /* Error checking */
+/*08*/  /* Cannot be larger than OCFS2_QBLK_RESERVED_SPACE */
+};
+/*
+ * Locate the quota trailer inside a quota block: it sits in the last
+ * OCFS2_QBLK_RESERVED_SPACE bytes of the blocksize-sized buffer @buf.
+ */
+static inline struct ocfs2_disk_dqtrailer *ocfs2_block_dqtrailer(int blocksize,
+                                                                 void *buf)
+{
+        return (struct ocfs2_disk_dqtrailer *)
+                ((char *)buf + (blocksize - OCFS2_QBLK_RESERVED_SPACE));
+}
 #ifdef __KERNEL__
 static inline int ocfs2_fast_symlink_chars(struct super_block *sb)
 {
diff --git a/fs/ocfs2/ocfs2_jbd_compat.h b/fs/ocfs2/ocfs2_jbd_compat.h
deleted file mode 100644
index b91c78f8f558..000000000000
--- a/fs/ocfs2/ocfs2_jbd_compat.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * ocfs2_jbd_compat.h
- *
- * Compatibility defines for JBD.
- *
- * Copyright (C) 2008 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- */
-#ifndef OCFS2_JBD_COMPAT_H
-#define OCFS2_JBD_COMPAT_H
-#ifndef CONFIG_OCFS2_COMPAT_JBD
-# error Should not have been included
-#endif
-struct jbd2_inode {
-        unsigned int dummy;
-};
-#define JBD2_BARRIER                    JFS_BARRIER
-#define JBD2_DEFAULT_MAX_COMMIT_AGE     JBD_DEFAULT_MAX_COMMIT_AGE
-#define jbd2_journal_ack_err                    journal_ack_err
-#define jbd2_journal_clear_err                  journal_clear_err
-#define jbd2_journal_destroy                    journal_destroy
-#define jbd2_journal_dirty_metadata             journal_dirty_metadata
-#define jbd2_journal_errno                      journal_errno
-#define jbd2_journal_extend                     journal_extend
-#define jbd2_journal_flush                      journal_flush
-#define jbd2_journal_force_commit               journal_force_commit
-#define jbd2_journal_get_write_access           journal_get_write_access
-#define jbd2_journal_get_undo_access            journal_get_undo_access
-#define jbd2_journal_init_inode                 journal_init_inode
-#define jbd2_journal_invalidatepage             journal_invalidatepage
-#define jbd2_journal_load                       journal_load
-#define jbd2_journal_lock_updates               journal_lock_updates
-#define jbd2_journal_restart                    journal_restart
-#define jbd2_journal_start                      journal_start
-#define jbd2_journal_start_commit               journal_start_commit
-#define jbd2_journal_stop                       journal_stop
-#define jbd2_journal_try_to_free_buffers        journal_try_to_free_buffers
-#define jbd2_journal_unlock_updates             journal_unlock_updates
-#define jbd2_journal_wipe                       journal_wipe
-#define jbd2_log_wait_commit                    log_wait_commit
-static inline int jbd2_journal_file_inode(handle_t *handle,
-                                          struct jbd2_inode *inode)
-{
-        return 0;
-}
-static inline int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode,
-                                                      loff_t new_size)
-{
-        return 0;
-}
-static inline void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode,
-                                               struct inode *inode)
-{
-        return;
-}
-static inline void jbd2_journal_release_jbd_inode(journal_t *journal,
-                                                  struct jbd2_inode *jinode)
-{
-        return;
-}
-#endif  /* OCFS2_JBD_COMPAT_H */
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index 82c200f7a8f1..eb6f50c9ceca 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -46,6 +46,7 @@ enum ocfs2_lock_type {
        OCFS2_LOCK_TYPE_DENTRY,
        OCFS2_LOCK_TYPE_OPEN,
        OCFS2_LOCK_TYPE_FLOCK,
+        OCFS2_LOCK_TYPE_QINFO,
        OCFS2_NUM_LOCK_TYPES
 };
@@ -77,6 +78,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
                case OCFS2_LOCK_TYPE_FLOCK:
                        c = 'F';
                        break;
+                case OCFS2_LOCK_TYPE_QINFO:
+                        c = 'Q';
+                        break;
                default:
                        c = '\0';
        }
@@ -95,6 +99,7 @@ static char *ocfs2_lock_type_strings[] = {
        [OCFS2_LOCK_TYPE_DENTRY] = "Dentry",
        [OCFS2_LOCK_TYPE_OPEN] = "Open",
        [OCFS2_LOCK_TYPE_FLOCK] = "Flock",
+        [OCFS2_LOCK_TYPE_QINFO] = "Quota",
 };
 static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
new file mode 100644
index 000000000000..7365e2e08706
--- /dev/null
+++ b/fs/ocfs2/quota.h
@@ -0,0 +1,119 @@
+/*
+ * quota.h for OCFS2
+ *
+ * On disk quota structures for local and global quota file, in-memory
+ * structures.
+ *
+ */
+#ifndef _OCFS2_QUOTA_H
+#define _OCFS2_QUOTA_H
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/quota.h>
+#include <linux/list.h>
+#include <linux/dqblk_qtree.h>
+#include "ocfs2.h"
+/* Common stuff */
+/* id number of quota format */
+#define QFMT_OCFS2 3
+/*
+ * In-memory structures
+ */
+struct ocfs2_dquot {
+        struct dquot dq_dquot;  /* Generic VFS dquot */
+        loff_t dq_local_off;    /* Offset in the local quota file */
+        struct ocfs2_quota_chunk *dq_chunk;     /* Chunk dquot is in */
+        unsigned int dq_use_count;      /* Number of nodes having reference to this entry in global quota file */
+        s64 dq_origspace;       /* Last globally synced space usage */
+        s64 dq_originodes;      /* Last globally synced inode usage */
+};
+/* Description of one chunk to recover in memory */
+struct ocfs2_recovery_chunk {
+        struct list_head rc_list;       /* List of chunks */
+        int rc_chunk;                   /* Chunk number */
+        unsigned long *rc_bitmap;       /* Bitmap of entries to recover */
+};
+struct ocfs2_quota_recovery {
+        struct list_head r_list[MAXQUOTAS];     /* List of chunks to recover */
+};
+/* In-memory structure with quota header information */
+struct ocfs2_mem_dqinfo {
+        unsigned int dqi_type;          /* Quota type this structure describes */
+        unsigned int dqi_chunks;        /* Number of chunks in local quota file */
+        unsigned int dqi_blocks;        /* Number of blocks allocated for local quota file */
+        unsigned int dqi_syncms;        /* How often should we sync with other nodes */
+        unsigned int dqi_syncjiff;      /* Precomputed dqi_syncms in jiffies */
+        struct list_head dqi_chunk;     /* List of chunks */
+        struct inode *dqi_gqinode;      /* Global quota file inode */
+        struct ocfs2_lock_res dqi_gqlock;       /* Lock protecting quota information structure */
+        struct buffer_head *dqi_gqi_bh; /* Buffer head with global quota file inode - set only if inode lock is obtained */
+        int dqi_gqi_count;              /* Number of holders of dqi_gqi_bh */
+        struct buffer_head *dqi_lqi_bh; /* Buffer head with local quota file inode */
+        struct buffer_head *dqi_ibh;    /* Buffer with information header */
+        struct qtree_mem_dqinfo dqi_gi; /* Info about global file */
+        struct delayed_work dqi_sync_work;      /* Work for syncing dquots */
+        struct ocfs2_quota_recovery *dqi_rec;   /* Pointer to recovery
+                                                 * information, in case we
+                                                 * enable quotas on file
+                                                 * needing it */
+};
+/*
+ * Convert a generic VFS dquot into the ocfs2_dquot that embeds it as its
+ * dq_dquot member.
+ */
+static inline struct ocfs2_dquot *OCFS2_DQUOT(struct dquot *dquot)
+{
+        return container_of(dquot, struct ocfs2_dquot, dq_dquot);
+}
+struct ocfs2_quota_chunk {
+        struct list_head qc_chunk;      /* List of quotafile chunks */
+        int qc_num;                     /* Number of quota chunk */
+        struct buffer_head *qc_headerbh;        /* Buffer head with chunk header */
+};
+extern struct kmem_cache *ocfs2_dquot_cachep;
+extern struct kmem_cache *ocfs2_qf_chunk_cachep;
+extern struct qtree_fmt_operations ocfs2_global_ops;
+struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
+                                struct ocfs2_super *osb, int slot_num);
+int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
+                                struct ocfs2_quota_recovery *rec,
+                                int slot_num);
+void ocfs2_free_quota_recovery(struct ocfs2_quota_recovery *rec);
+ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data,
+                         size_t len, loff_t off);
+ssize_t ocfs2_quota_write(struct super_block *sb, int type,
+                          const char *data, size_t len, loff_t off);
+int ocfs2_global_read_info(struct super_block *sb, int type);
+int ocfs2_global_write_info(struct super_block *sb, int type);
+int ocfs2_global_read_dquot(struct dquot *dquot);
+int __ocfs2_sync_dquot(struct dquot *dquot, int freeing);
+/* Sync @dquot via __ocfs2_sync_dquot() with the 'freeing' argument cleared. */
+static inline int ocfs2_sync_dquot(struct dquot *dquot)
+{
+        return __ocfs2_sync_dquot(dquot, 0);
+}
+/* Sync @dquot via __ocfs2_sync_dquot() with the 'freeing' argument set. */
+static inline int ocfs2_global_release_dquot(struct dquot *dquot)
+{
+        return __ocfs2_sync_dquot(dquot, 1);
+}
+int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
+void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
+int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
+                           struct buffer_head **bh);
+extern struct dquot_operations ocfs2_quota_operations;
+extern struct quota_format_type ocfs2_quota_format;
+int ocfs2_quota_setup(void);
+void ocfs2_quota_shutdown(void);
+#endif /* _OCFS2_QUOTA_H */
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
new file mode 100644
index 000000000000..1ed0f7c86869
--- /dev/null
+++ b/fs/ocfs2/quota_global.c
@@ -0,0 +1,862 @@
+/*
+ *  Implementation of operations over global quota file
+ */
+#include <linux/spinlock.h>
+#include <linux/fs.h>
+#include <linux/quota.h>
+#include <linux/quotaops.h>
+#include <linux/dqblk_qtree.h>
+#include <linux/jiffies.h>
+#include <linux/writeback.h>
+#include <linux/workqueue.h>
+#define MLOG_MASK_PREFIX ML_QUOTA
+#include <cluster/masklog.h>
+#include "ocfs2_fs.h"
+#include "ocfs2.h"
+#include "alloc.h"
+#include "blockcheck.h"
+#include "inode.h"
+#include "journal.h"
+#include "file.h"
+#include "sysfile.h"
+#include "dlmglue.h"
+#include "uptodate.h"
+#include "quota.h"
+static struct workqueue_struct *ocfs2_quota_wq = NULL;
+static void qsync_work_fn(struct work_struct *work);
+/*
+ * Fill the in-memory dquot from an on-disk (little-endian) global quota
+ * entry.  A field group is copied only when its DQ_LASTSET_B bit is clear
+ * in dq_flags, so values set by the admin are not overwritten from disk.
+ * The use count is always taken from disk.
+ */
+static void ocfs2_global_disk2memdqb(struct dquot *dquot, void *dp)
+{
+        struct mem_dqblk *mem = &dquot->dq_dqb;
+        struct ocfs2_global_disk_dqblk *disk = dp;
+        /* Inode limits, then inode usage */
+        if (!test_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags)) {
+                mem->dqb_ihardlimit = le64_to_cpu(disk->dqb_ihardlimit);
+                mem->dqb_isoftlimit = le64_to_cpu(disk->dqb_isoftlimit);
+        }
+        if (!test_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags))
+                mem->dqb_curinodes = le64_to_cpu(disk->dqb_curinodes);
+        /* Space limits, then space usage */
+        if (!test_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags)) {
+                mem->dqb_bhardlimit = le64_to_cpu(disk->dqb_bhardlimit);
+                mem->dqb_bsoftlimit = le64_to_cpu(disk->dqb_bsoftlimit);
+        }
+        if (!test_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags))
+                mem->dqb_curspace = le64_to_cpu(disk->dqb_curspace);
+        /* Time limits for space and inode overuse */
+        if (!test_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags))
+                mem->dqb_btime = le64_to_cpu(disk->dqb_btime);
+        if (!test_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags))
+                mem->dqb_itime = le64_to_cpu(disk->dqb_itime);
+        OCFS2_DQUOT(dquot)->dq_use_count = le32_to_cpu(disk->dqb_use_count);
+}
+/*
+ * Serialize the in-memory dquot into an on-disk global quota entry,
+ * converting every field to little-endian.  The pad fields of the disk
+ * structure are not written here.
+ */
+static void ocfs2_global_mem2diskdqb(void *dp, struct dquot *dquot)
+{
+        struct mem_dqblk *mem = &dquot->dq_dqb;
+        struct ocfs2_global_disk_dqblk *disk = dp;
+        /* Identity and cross-node reference count */
+        disk->dqb_id = cpu_to_le32(dquot->dq_id);
+        disk->dqb_use_count = cpu_to_le32(OCFS2_DQUOT(dquot)->dq_use_count);
+        /* Inode limits and usage */
+        disk->dqb_ihardlimit = cpu_to_le64(mem->dqb_ihardlimit);
+        disk->dqb_isoftlimit = cpu_to_le64(mem->dqb_isoftlimit);
+        disk->dqb_curinodes = cpu_to_le64(mem->dqb_curinodes);
+        /* Space limits and usage */
+        disk->dqb_bhardlimit = cpu_to_le64(mem->dqb_bhardlimit);
+        disk->dqb_bsoftlimit = cpu_to_le64(mem->dqb_bsoftlimit);
+        disk->dqb_curspace = cpu_to_le64(mem->dqb_curspace);
+        /* Time limits */
+        disk->dqb_btime = cpu_to_le64(mem->dqb_btime);
+        disk->dqb_itime = cpu_to_le64(mem->dqb_itime);
+}
+/*
+ * Tell whether the on-disk entry @dp belongs to @dquot: returns 0 for an
+ * entry that qtree_entry_unused() reports as unused, otherwise non-zero
+ * iff the stored id equals dquot->dq_id.
+ */
+static int ocfs2_global_is_id(void *dp, struct dquot *dquot)
+{
+        struct mem_dqinfo *info = sb_dqinfo(dquot->dq_sb, dquot->dq_type);
+        struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+        struct ocfs2_global_disk_dqblk *entry = dp;
+        if (qtree_entry_unused(&oinfo->dqi_gi, dp))
+                return 0;
+        return le32_to_cpu(entry->dqb_id) == dquot->dq_id;
+}
+/*
+ * qtree format operations for entries of the global quota file: conversion
+ * between the on-disk and in-memory representation, and id matching.
+ */
+struct qtree_fmt_operations ocfs2_global_ops = {
+        .mem2disk_dqblk = ocfs2_global_mem2diskdqb,
+        .disk2mem_dqblk = ocfs2_global_disk2memdqb,
+        .is_id = ocfs2_global_is_id,
+};
+/*
+ * Validation callback handed to ocfs2_read_virt_blocks(): checks a quota
+ * block against the ocfs2_block_check stored in its trailer and returns
+ * the result of ocfs2_validate_meta_ecc().
+ */
+static int ocfs2_validate_quota_block(struct super_block *sb,
+                                      struct buffer_head *bh)
+{
+        struct ocfs2_disk_dqtrailer *trailer;
+        mlog(0, "Validating quota block %llu\n",
+             (unsigned long long)bh->b_blocknr);
+        BUG_ON(!buffer_uptodate(bh));
+        trailer = ocfs2_block_dqtrailer(sb->s_blocksize, bh->b_data);
+        /*
+         * A failed check is reported to the caller, but the filesystem
+         * is otherwise left running: any error is local to this block.
+         */
+        return ocfs2_validate_meta_ecc(sb, bh->b_data, &trailer->dq_check);
+}
+/*
+ * Read (and validate) one logical block of a quota file.  If the caller
+ * passes *bh == NULL, the buffer head obtained by ocfs2_read_virt_blocks()
+ * is handed back through @bh on success.  Returns 0 or the read error.
+ */
+int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
+                           struct buffer_head **bh)
+{
+        struct buffer_head *found = *bh;
+        int rc;
+        rc = ocfs2_read_virt_blocks(inode, v_block, 1, &found, 0,
+                                    ocfs2_validate_quota_block);
+        if (rc) {
+                mlog_errno(rc);
+                return rc;
+        }
+        /* Pass a newly obtained bh up only when the caller supplied none */
+        if (!*bh)
+                *bh = found;
+        return 0;
+}
+/*
+ * Map logical block @block of @inode to its physical block and get a
+ * buffer head for it with sb_getblk().  This function itself neither reads
+ * nor validates the block contents.
+ *
+ * Returns 0 with *bh set, the error from the extent map lookup, or -EIO
+ * when sb_getblk() returns NULL.
+ */
+static int ocfs2_get_quota_block(struct inode *inode, int block,
+                                 struct buffer_head **bh)
+{
+        u64 pblock, pcount;
+        int err;
+        /* ip_alloc_sem is held for reading across the extent map lookup */
+        down_read(&OCFS2_I(inode)->ip_alloc_sem);
+        err = ocfs2_extent_map_get_blocks(inode, block, &pblock, &pcount, NULL);
+        up_read(&OCFS2_I(inode)->ip_alloc_sem);
+        if (err) {
+                mlog_errno(err);
+                return err;
+        }
+        *bh = sb_getblk(inode->i_sb, pblock);
+        if (!*bh) {
+                err = -EIO;
+                mlog_errno(err);
+        }
+        return err;
+}
+/* Read from the global quota file block by block, bypassing the pagecache
+ * because we cannot afford acquiring the locks that would require.
+ * Operations are serialized by the quota cluster lock, which the caller is
+ * responsible for holding.  Returns the number of bytes read (clamped to
+ * the file size) or the error from ocfs2_read_quota_block(). */
+ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data,
+                         size_t len, loff_t off)
+{
+        struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+        struct inode *gqinode = oinfo->dqi_gqinode;
+        loff_t i_size = i_size_read(gqinode);
+        sector_t blk = off >> sb->s_blocksize_bits;
+        int offset = off & (sb->s_blocksize - 1);
+        struct buffer_head *bh;
+        size_t left, chunk;
+        int err = 0;
+        /* Nothing to read past EOF; clamp a read that crosses it */
+        if (off > i_size)
+                return 0;
+        if (off + len > i_size)
+                len = i_size - off;
+        for (left = len; left > 0; left -= chunk, data += chunk, blk++) {
+                chunk = min_t(size_t, (sb->s_blocksize - offset), left);
+                bh = NULL;
+                err = ocfs2_read_quota_block(gqinode, blk, &bh);
+                if (err) {
+                        mlog_errno(err);
+                        return err;
+                }
+                memcpy(data, bh->b_data + offset, chunk);
+                brelse(bh);
+                /* Only the first block can start mid-block */
+                offset = 0;
+        }
+        return len;
+}
+/* Write to quotafile (we know the transaction is already started and has
+ * enough credits).
+ *
+ * Writes @len bytes from @data at offset @off of the global quota file of
+ * the given @type.  A write never crosses into the reserved trailer of a
+ * block: a longer request triggers WARN_ON and is truncated.  The file is
+ * extended first when the write ends past i_size.
+ *
+ * Returns the number of bytes written, -EIO when no transaction is running,
+ * or the error of the failing step.  gqinode->i_mutex is taken here and is
+ * released on every return path after it has been acquired. */
+ssize_t ocfs2_quota_write(struct super_block *sb, int type,
+                          const char *data, size_t len, loff_t off)
+{
+        struct mem_dqinfo *info = sb_dqinfo(sb, type);
+        struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+        struct inode *gqinode = oinfo->dqi_gqinode;
+        int offset = off & (sb->s_blocksize - 1);
+        sector_t blk = off >> sb->s_blocksize_bits;
+        int err = 0, new = 0, ja_type;
+        struct buffer_head *bh = NULL;
+        handle_t *handle = journal_current_handle();
+        if (!handle) {
+                mlog(ML_ERROR, "Quota write (off=%llu, len=%llu) cancelled "
+                     "because transaction was not started.\n",
+                     (unsigned long long)off, (unsigned long long)len);
+                return -EIO;
+        }
+        /* Keep the write out of the reserved space at the end of the block */
+        if (len > sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset) {
+                WARN_ON(1);
+                len = sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset;
+        }
+        mutex_lock_nested(&gqinode->i_mutex, I_MUTEX_QUOTA);
+        if (gqinode->i_size < off + len) {
+                down_write(&OCFS2_I(gqinode)->ip_alloc_sem);
+                err = ocfs2_extend_no_holes(gqinode, off + len, off);
+                up_write(&OCFS2_I(gqinode)->ip_alloc_sem);
+                if (err < 0)
+                        goto out;
+                err = ocfs2_simple_size_update(gqinode,
+                                               oinfo->dqi_gqi_bh,
+                                               off + len);
+                if (err < 0)
+                        goto out;
+                new = 1;
+        }
+        /* Not rewriting whole block? */
+        if ((offset || len < sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) &&
+            !new) {
+                err = ocfs2_read_quota_block(gqinode, blk, &bh);
+                ja_type = OCFS2_JOURNAL_ACCESS_WRITE;
+        } else {
+                err = ocfs2_get_quota_block(gqinode, blk, &bh);
+                ja_type = OCFS2_JOURNAL_ACCESS_CREATE;
+        }
+        /*
+         * i_mutex is held here, so a failure must leave through 'out'
+         * (which unlocks and logs the error) rather than return directly.
+         */
+        if (err)
+                goto out;
+        lock_buffer(bh);
+        if (new)
+                memset(bh->b_data, 0, sb->s_blocksize);
+        memcpy(bh->b_data + offset, data, len);
+        flush_dcache_page(bh->b_page);
+        set_buffer_uptodate(bh);
+        unlock_buffer(bh);
+        ocfs2_set_buffer_uptodate(gqinode, bh);
+        err = ocfs2_journal_access_dq(handle, gqinode, bh, ja_type);
+        if (err < 0) {
+                brelse(bh);
+                goto out;
+        }
+        err = ocfs2_journal_dirty(handle, bh);
+        brelse(bh);
+out:
+        if (err) {
+                mutex_unlock(&gqinode->i_mutex);
+                mlog_errno(err);
+                return err;
+        }
+        gqinode->i_version++;
+        ocfs2_mark_inode_dirty(handle, gqinode, oinfo->dqi_gqi_bh);
+        mutex_unlock(&gqinode->i_mutex);
+        return len;
+}
+/*
+ * Take the inode lock on the global quota file (@ex is passed through to
+ * ocfs2_inode_lock()).  Holders are counted under dq_data_lock; the first
+ * one records the inode's buffer head in dqi_gqi_bh, later ones only warn
+ * if they were given a different buffer head.  Returns 0 or the negative
+ * status from ocfs2_inode_lock().
+ */
+int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
+{
+        struct buffer_head *di_bh = NULL;
+        int ret = ocfs2_inode_lock(oinfo->dqi_gqinode, &di_bh, ex);
+        if (ret < 0)
+                return ret;
+        spin_lock(&dq_data_lock);
+        if (oinfo->dqi_gqi_count++ == 0)
+                oinfo->dqi_gqi_bh = di_bh;
+        else
+                WARN_ON(di_bh != oinfo->dqi_gqi_bh);
+        spin_unlock(&dq_data_lock);
+        return 0;
+}
+/*
+ * Undo one ocfs2_lock_global_qf(): drop the inode lock and one reference
+ * on the cached buffer head, then decrement the holder count under
+ * dq_data_lock, clearing dqi_gqi_bh when the last holder goes away.
+ */
+void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
+{
+        ocfs2_inode_unlock(oinfo->dqi_gqinode, ex);
+        brelse(oinfo->dqi_gqi_bh);
+        spin_lock(&dq_data_lock);
+        if (--oinfo->dqi_gqi_count == 0)
+                oinfo->dqi_gqi_bh = NULL;
+        spin_unlock(&dq_data_lock);
+}
+/* Read information header from global quota file
+ *
+ * Looks up the global quota system inode for @type, initializes the
+ * ocfs2_mem_dqinfo hanging off sb_dqinfo(sb, type)->dqi_priv, reads the
+ * ocfs2_global_disk_dqinfo header at OCFS2_GLOBAL_INFO_OFF and queues the
+ * periodic sync work.
+ *
+ * NOTE(review): on success 'status' still holds the byte count returned by
+ * quota_read (sizeof(struct ocfs2_global_disk_dqinfo)), so this returns a
+ * positive value rather than 0; failures return a negative errno.
+ * NOTE(review): the inode reference obtained here is not dropped on the
+ * error paths of this function; presumably the caller releases
+ * oinfo->dqi_gqinode - confirm.
+ */
+int ocfs2_global_read_info(struct super_block *sb, int type)
+{
+        struct inode *gqinode = NULL;
+        /* System inode holding the global quota file, indexed by quota type */
+        unsigned int ino[MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE,
+                                        GROUP_QUOTA_SYSTEM_INODE };
+        struct ocfs2_global_disk_dqinfo dinfo;
+        struct mem_dqinfo *info = sb_dqinfo(sb, type);
+        struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+        int status;
+        mlog_entry_void();
+        /* Read global header */
+        gqinode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type],
+                        OCFS2_INVALID_SLOT);
+        if (!gqinode) {
+                mlog(ML_ERROR, "failed to get global quota inode (type=%d)\n",
+                        type);
+                status = -EINVAL;
+                goto out_err;
+        }
+        /* Set up the in-memory info before the first lock/read uses it */
+        oinfo->dqi_gi.dqi_sb = sb;
+        oinfo->dqi_gi.dqi_type = type;
+        ocfs2_qinfo_lock_res_init(&oinfo->dqi_gqlock, oinfo);
+        oinfo->dqi_gi.dqi_entry_size = sizeof(struct ocfs2_global_disk_dqblk);
+        oinfo->dqi_gi.dqi_ops = &ocfs2_global_ops;
+        oinfo->dqi_gqi_bh = NULL;
+        oinfo->dqi_gqi_count = 0;
+        oinfo->dqi_gqinode = gqinode;
+        /* Lock the global quota file with ex = 0 around the header read */
+        status = ocfs2_lock_global_qf(oinfo, 0);
+        if (status < 0) {
+                mlog_errno(status);
+                goto out_err;
+        }
+        status = sb->s_op->quota_read(sb, type, (char *)&dinfo,
+                                      sizeof(struct ocfs2_global_disk_dqinfo),
+                                      OCFS2_GLOBAL_INFO_OFF);
+        ocfs2_unlock_global_qf(oinfo, 0);
+        /* A short read is turned into -EIO; a negative status is kept */
+        if (status != sizeof(struct ocfs2_global_disk_dqinfo)) {
+                mlog(ML_ERROR, "Cannot read global quota info (%d).\n",
+                     status);
+                if (status >= 0)
+                        status = -EIO;
+                mlog_errno(status);
+                goto out_err;
+        }
+        /* Convert the little-endian on-disk header into memory */
+        info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace);
+        info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace);
+        oinfo->dqi_syncms = le32_to_cpu(dinfo.dqi_syncms);
+        oinfo->dqi_syncjiff = msecs_to_jiffies(oinfo->dqi_syncms);
+        oinfo->dqi_gi.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
+        oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk);
+        oinfo->dqi_gi.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry);
+        oinfo->dqi_gi.dqi_blocksize_bits = sb->s_blocksize_bits;
+        /* The last OCFS2_QBLK_RESERVED_SPACE bytes of a block are reserved */
+        oinfo->dqi_gi.dqi_usable_bs = sb->s_blocksize -
+                                                OCFS2_QBLK_RESERVED_SPACE;
+        oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi);
+        /* First sync runs after dqi_syncjiff jiffies */
+        INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn);
+        queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work,
+                           oinfo->dqi_syncjiff);
+out_err:
+        mlog_exit(status);
+        return status;
+}
+/*
+ * Write information to the global quota file. Expects an exclusive lock on
+ * the quota file inode and on the quota info.
+ *
+ * Returns 0 on success, the negative error reported by ->quota_write(), or
+ * -EIO when fewer bytes than requested were written.
+ */
+static int __ocfs2_global_write_info(struct super_block *sb, int type)
+{
+        struct mem_dqinfo *minfo = sb_dqinfo(sb, type);
+        struct ocfs2_mem_dqinfo *oinfo = minfo->dqi_priv;
+        struct ocfs2_global_disk_dqinfo disk;
+        ssize_t written;
+        /* Clear the dirty flag and copy the generic fields under dq_data_lock */
+        spin_lock(&dq_data_lock);
+        minfo->dqi_flags &= ~DQF_INFO_DIRTY;
+        disk.dqi_bgrace = cpu_to_le32(minfo->dqi_bgrace);
+        disk.dqi_igrace = cpu_to_le32(minfo->dqi_igrace);
+        spin_unlock(&dq_data_lock);
+        /* The ocfs2 specific fields are copied without dq_data_lock */
+        disk.dqi_syncms = cpu_to_le32(oinfo->dqi_syncms);
+        disk.dqi_blocks = cpu_to_le32(oinfo->dqi_gi.dqi_blocks);
+        disk.dqi_free_blk = cpu_to_le32(oinfo->dqi_gi.dqi_free_blk);
+        disk.dqi_free_entry = cpu_to_le32(oinfo->dqi_gi.dqi_free_entry);
+        written = sb->s_op->quota_write(sb, type, (char *)&disk,
+                                        sizeof(struct ocfs2_global_disk_dqinfo),
+                                        OCFS2_GLOBAL_INFO_OFF);
+        if (written == sizeof(struct ocfs2_global_disk_dqinfo))
+                return 0;
+        mlog(ML_ERROR, "Cannot write global quota info structure\n");
+        /* A short write is reported as -EIO, a real error is passed on */
+        return written < 0 ? written : -EIO;
+}
+/*
+ * Write the global quota info of the given type while holding the quota
+ * info lock exclusively. Returns the locking error if the lock cannot be
+ * taken, otherwise the result of __ocfs2_global_write_info().
+ */
+int ocfs2_global_write_info(struct super_block *sb, int type)
+{
+        struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+        int ret = ocfs2_qinfo_lock(oinfo, 1);
+        if (ret >= 0) {
+                ret = __ocfs2_global_write_info(sb, type);
+                ocfs2_qinfo_unlock(oinfo, 1);
+        }
+        return ret;
+}
+/* Read in information from global quota file and acquire a reference to it.
+ * dquot_acquire() has already started the transaction and locked quota file.
+ *
+ * The dquot is read under a shared quota info lock. Its use count is then
+ * incremented and the entry is written back; when the dquot has no entry in
+ * the file yet (dq_off == 0) the exclusive quota info lock is taken in
+ * addition to the shared one before the write.
+ *
+ * Returns the result of qtree_write_dquot(), or the first error encountered
+ * (errors are negative and are logged with mlog_errno()). */
+int ocfs2_global_read_dquot(struct dquot *dquot)
+{
+        /* ex records whether the exclusive lock was taken on top of the shared one */
+        int err, err2, ex = 0;
+        struct ocfs2_mem_dqinfo *info =
+                        sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
+        err = ocfs2_qinfo_lock(info, 0);
+        if (err < 0)
+                goto out;
+        err = qtree_read_dquot(&info->dqi_gi, dquot);
+        if (err < 0)
+                goto out_qlock;
+        OCFS2_DQUOT(dquot)->dq_use_count++;
+        /* Remember the usage just read; __ocfs2_sync_dquot() diffs against it */
+        OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
+        OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
+        if (!dquot->dq_off) {   /* No real quota entry? */
+                /* Upgrade to exclusive lock for allocation */
+                err = ocfs2_qinfo_lock(info, 1);
+                if (err < 0)
+                        goto out_qlock;
+                ex = 1;
+        }
+        err = qtree_write_dquot(&info->dqi_gi, dquot);
+        /* Write the info out if it became dirty; keep the first error */
+        if (ex && info_dirty(sb_dqinfo(dquot->dq_sb, dquot->dq_type))) {
+                err2 = __ocfs2_global_write_info(dquot->dq_sb, dquot->dq_type);
+                if (!err)
+                        err = err2;
+        }
+out_qlock:
+        /* Drop the exclusive lock (if taken) before the shared one */
+        if (ex)
+                ocfs2_qinfo_unlock(info, 1);
+        ocfs2_qinfo_unlock(info, 0);
+out:
+        if (err < 0)
+                mlog_errno(err);
+        return err;
+}
+/* Sync local information about quota modifications with global quota file.
+ * Caller must have started the transaction and obtained exclusive lock for
+ * global quota file inode.
+ *
+ * @dquot:   dquot whose local changes are merged into the global file
+ * @freeing: nonzero when a reference is being dropped: the use count is
+ *           decremented and the entry is released from the quota tree once
+ *           the count reaches zero. The value is also passed as the "ex"
+ *           argument when locking the quota info.
+ *
+ * Returns a negative error on failure (logged with mlog_errno()), otherwise
+ * the result of the last quota tree operation performed. */
+int __ocfs2_sync_dquot(struct dquot *dquot, int freeing)
+{
+        int err, err2;
+        struct super_block *sb = dquot->dq_sb;
+        int type = dquot->dq_type;
+        struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
+        struct ocfs2_global_disk_dqblk dqblk;
+        s64 spacechange, inodechange;
+        time_t olditime, oldbtime;
+        /* Fetch the current on-disk copy of the global entry */
+        err = sb->s_op->quota_read(sb, type, (char *)&dqblk,
+                                   sizeof(struct ocfs2_global_disk_dqblk),
+                                   dquot->dq_off);
+        if (err != sizeof(struct ocfs2_global_disk_dqblk)) {
+                if (err >= 0) {
+                        mlog(ML_ERROR, "Short read from global quota file "
+                                       "(%u read)\n", err);
+                        err = -EIO;
+                }
+                goto out;
+        }
+        /* Update space and inode usage. Get also other information from
+         * global quota file so that we don't overwrite any changes there. */
+        spin_lock(&dq_data_lock);
+        /* Local delta accumulated since the last sync */
+        spacechange = dquot->dq_dqb.dqb_curspace -
+                                        OCFS2_DQUOT(dquot)->dq_origspace;
+        inodechange = dquot->dq_dqb.dqb_curinodes -
+                                        OCFS2_DQUOT(dquot)->dq_originodes;
+        /* Save the local grace times before the disk copy replaces them */
+        olditime = dquot->dq_dqb.dqb_itime;
+        oldbtime = dquot->dq_dqb.dqb_btime;
+        ocfs2_global_disk2memdqb(dquot, &dqblk);
+        mlog(0, "Syncing global dquot %u space %lld+%lld, inodes %lld+%lld\n",
+             dquot->dq_id, dquot->dq_dqb.dqb_curspace, (long long)spacechange,
+             dquot->dq_dqb.dqb_curinodes, (long long)inodechange);
+        /* Apply the local delta unless the corresponding LASTSET flag is set */
+        if (!test_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags))
+                dquot->dq_dqb.dqb_curspace += spacechange;
+        if (!test_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags))
+                dquot->dq_dqb.dqb_curinodes += inodechange;
+        /* Set properly space grace time... */
+        if (dquot->dq_dqb.dqb_bsoftlimit &&
+            dquot->dq_dqb.dqb_curspace > dquot->dq_dqb.dqb_bsoftlimit) {
+                /* Over the soft limit: keep the earlier of both grace times */
+                if (!test_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags) &&
+                    oldbtime > 0) {
+                        if (dquot->dq_dqb.dqb_btime > 0)
+                                dquot->dq_dqb.dqb_btime =
+                                        min(dquot->dq_dqb.dqb_btime, oldbtime);
+                        else
+                                dquot->dq_dqb.dqb_btime = oldbtime;
+                }
+        } else {
+                dquot->dq_dqb.dqb_btime = 0;
+                clear_bit(DQ_BLKS_B, &dquot->dq_flags);
+        }
+        /* Set properly inode grace time... */
+        if (dquot->dq_dqb.dqb_isoftlimit &&
+            dquot->dq_dqb.dqb_curinodes > dquot->dq_dqb.dqb_isoftlimit) {
+                /* Same rule as for the space grace time above */
+                if (!test_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags) &&
+                    olditime > 0) {
+                        if (dquot->dq_dqb.dqb_itime > 0)
+                                dquot->dq_dqb.dqb_itime =
+                                        min(dquot->dq_dqb.dqb_itime, olditime);
+                        else
+                                dquot->dq_dqb.dqb_itime = olditime;
+                }
+        } else {
+                dquot->dq_dqb.dqb_itime = 0;
+                clear_bit(DQ_INODES_B, &dquot->dq_flags);
+        }
+        /* All information is properly updated, clear the flags */
+        __clear_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
+        __clear_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
+        __clear_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
+        __clear_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
+        __clear_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
+        __clear_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
+        /* The merged values are the baseline for the next sync */
+        OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
+        OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
+        spin_unlock(&dq_data_lock);
+        err = ocfs2_qinfo_lock(info, freeing);
+        if (err < 0) {
+                mlog(ML_ERROR, "Failed to lock quota info, loosing quota write"
+                               " (type=%d, id=%u)\n", dquot->dq_type,
+                               (unsigned)dquot->dq_id);
+                goto out;
+        }
+        if (freeing)
+                OCFS2_DQUOT(dquot)->dq_use_count--;
+        err = qtree_write_dquot(&info->dqi_gi, dquot);
+        if (err < 0)
+                goto out_qlock;
+        /* Last user gone: remove the entry from the quota tree */
+        if (freeing && !OCFS2_DQUOT(dquot)->dq_use_count) {
+                err = qtree_release_dquot(&info->dqi_gi, dquot);
+                /* Write the info out if it became dirty; keep the first error */
+                if (info_dirty(sb_dqinfo(sb, type))) {
+                        err2 = __ocfs2_global_write_info(sb, type);
+                        if (!err)
+                                err = err2;
+                }
+        }
+out_qlock:
+        ocfs2_qinfo_unlock(info, freeing);
+out:
+        if (err < 0)
+                mlog_errno(err);
+        return err;
+}
+/*
+ *  Functions for periodic syncing of dquots with global file
+ */
+/*
+ * Callback passed to dquot_scan_active() by qsync_work_fn(): sync one dquot
+ * with the global quota file and commit it.
+ *
+ * Dquots whose type differs from @type are skipped and 0 is returned. For the
+ * others the global quota file is locked exclusively, a transaction with
+ * OCFS2_QSYNC_CREDITS credits is started, the dquot is synced under
+ * dqio_mutex and then committed.
+ *
+ * Returns the result of dquot_commit(), or the error from taking the lock or
+ * starting the transaction. An error from ocfs2_sync_dquot() is only logged;
+ * status is overwritten by the dquot_commit() result afterwards.
+ */
+static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type)
+{
+        handle_t *handle;
+        struct super_block *sb = dquot->dq_sb;
+        struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+        struct ocfs2_super *osb = OCFS2_SB(sb);
+        int status = 0;
+        mlog_entry("id=%u qtype=%u type=%lu device=%s\n", dquot->dq_id,
+                   dquot->dq_type, type, sb->s_id);
+        /* Only dquots of the requested quota type are handled */
+        if (type != dquot->dq_type)
+                goto out;
+        status = ocfs2_lock_global_qf(oinfo, 1);
+        if (status < 0)
+                goto out;
+        handle = ocfs2_start_trans(osb, OCFS2_QSYNC_CREDITS);
+        if (IS_ERR(handle)) {
+                status = PTR_ERR(handle);
+                mlog_errno(status);
+                goto out_ilock;
+        }
+        mutex_lock(&sb_dqopt(sb)->dqio_mutex);
+        status = ocfs2_sync_dquot(dquot);
+        mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
+        /* Logged only; the local structure is still written below */
+        if (status < 0)
+                mlog_errno(status);
+        /* We have to write local structure as well... */
+        dquot_mark_dquot_dirty(dquot);
+        status = dquot_commit(dquot);
+        if (status < 0)
+                mlog_errno(status);
+        ocfs2_commit_trans(osb, handle);
+out_ilock:
+        ocfs2_unlock_global_qf(oinfo, 1);
+out:
+        mlog_exit(status);
+        return status;
+}
+/*
+ * Delayed work handler: run ocfs2_sync_dquot_helper() over the active dquots
+ * of this quota type and queue the work again after dqi_syncjiff jiffies.
+ */
+static void qsync_work_fn(struct work_struct *work)
+{
+        struct ocfs2_mem_dqinfo *qinfo;
+        struct super_block *sb;
+        /* The work item is embedded in the ocfs2 quota info */
+        qinfo = container_of(work, struct ocfs2_mem_dqinfo,
+                             dqi_sync_work.work);
+        sb = qinfo->dqi_gqinode->i_sb;
+        dquot_scan_active(sb, ocfs2_sync_dquot_helper, qinfo->dqi_type);
+        queue_delayed_work(ocfs2_quota_wq, &qinfo->dqi_sync_work,
+                           qinfo->dqi_syncjiff);
+}
+/*
+ *  Wrappers for generic quota functions
+ */
+/*
+ * Call dquot_commit() for the dquot inside a transaction of its own with
+ * OCFS2_QWRITE_CREDITS credits. Returns the dquot_commit() result, or the
+ * error from starting the transaction.
+ */
+static int ocfs2_write_dquot(struct dquot *dquot)
+{
+        struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
+        handle_t *trans;
+        int ret = 0;
+        mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
+        trans = ocfs2_start_trans(osb, OCFS2_QWRITE_CREDITS);
+        if (!IS_ERR(trans)) {
+                ret = dquot_commit(dquot);
+                ocfs2_commit_trans(osb, trans);
+        } else {
+                ret = PTR_ERR(trans);
+                mlog_errno(ret);
+        }
+        mlog_exit(ret);
+        return ret;
+}
+/*
+ * Journal credits needed for deleting a dquot of the given type. Returns 0
+ * when the quota feature for that type is not set on the filesystem.
+ */
+int ocfs2_calc_qdel_credits(struct super_block *sb, int type)
+{
+        /* RO-compat feature flag that enables each quota type */
+        int feat_of_type[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+                                        OCFS2_FEATURE_RO_COMPAT_GRPQUOTA };
+        struct ocfs2_mem_dqinfo *oinfo;
+        int credits = 0;
+        if (OCFS2_HAS_RO_COMPAT_FEATURE(sb, feat_of_type[type])) {
+                oinfo = sb_dqinfo(sb, type)->dqi_priv;
+                /* We modify tree, leaf block, global info, local chunk
+                 * header, global and local inode */
+                credits = oinfo->dqi_gi.dqi_qtree_depth + 2 + 1 +
+                          2 * OCFS2_INODE_UPDATE_CREDITS;
+        }
+        return credits;
+}
+/*
+ * Call dquot_release() with the global quota file locked exclusively and
+ * inside a transaction sized by ocfs2_calc_qdel_credits(). Returns the
+ * dquot_release() result or the error from locking / starting the
+ * transaction.
+ */
+static int ocfs2_release_dquot(struct dquot *dquot)
+{
+        struct super_block *sb = dquot->dq_sb;
+        struct ocfs2_mem_dqinfo *oinfo =
+                        sb_dqinfo(sb, dquot->dq_type)->dqi_priv;
+        struct ocfs2_super *osb = OCFS2_SB(sb);
+        handle_t *trans;
+        int ret;
+        mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
+        ret = ocfs2_lock_global_qf(oinfo, 1);
+        if (ret < 0)
+                goto done;
+        trans = ocfs2_start_trans(osb,
+                ocfs2_calc_qdel_credits(sb, dquot->dq_type));
+        if (IS_ERR(trans)) {
+                ret = PTR_ERR(trans);
+                mlog_errno(ret);
+        } else {
+                ret = dquot_release(dquot);
+                ocfs2_commit_trans(osb, trans);
+        }
+        ocfs2_unlock_global_qf(oinfo, 1);
+done:
+        mlog_exit(ret);
+        return ret;
+}
+/*
+ * Journal credits needed for initializing a dquot of the given type. Returns
+ * 0 when the quota feature for that type is not set on the filesystem.
+ */
+int ocfs2_calc_qinit_credits(struct super_block *sb, int type)
+{
+        /* RO-compat feature flag that enables each quota type */
+        int feat_of_type[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+                                        OCFS2_FEATURE_RO_COMPAT_GRPQUOTA };
+        struct ocfs2_mem_dqinfo *oinfo;
+        struct ocfs2_dinode *local_di, *global_di;
+        int local_extend, global_extend;
+        if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feat_of_type[type]))
+                return 0;
+        oinfo = sb_dqinfo(sb, type)->dqi_priv;
+        global_di = (struct ocfs2_dinode *)oinfo->dqi_gqi_bh->b_data;
+        local_di = (struct ocfs2_dinode *)oinfo->dqi_lqi_bh->b_data;
+        /* We can extend local file + global file. In local file we
+         * can modify info, chunk header block and dquot block. In
+         * global file we can modify info, tree and leaf block */
+        local_extend = ocfs2_calc_extend_credits(sb, &local_di->id2.i_list, 0);
+        global_extend = ocfs2_calc_extend_credits(sb, &global_di->id2.i_list, 0);
+        return local_extend + global_extend +
+               3 + oinfo->dqi_gi.dqi_qtree_depth + 2;
+}
+/*
+ * Call dquot_acquire() with the global quota file locked exclusively and
+ * inside a transaction sized by ocfs2_calc_qinit_credits(). Returns the
+ * dquot_acquire() result or the error from locking / starting the
+ * transaction.
+ */
+static int ocfs2_acquire_dquot(struct dquot *dquot)
+{
+        struct super_block *sb = dquot->dq_sb;
+        struct ocfs2_mem_dqinfo *oinfo =
+                        sb_dqinfo(sb, dquot->dq_type)->dqi_priv;
+        struct ocfs2_super *osb = OCFS2_SB(sb);
+        handle_t *trans;
+        int ret;
+        mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
+        /* We need an exclusive lock, because we're going to update use count
+         * and instantiate possibly new dquot structure */
+        ret = ocfs2_lock_global_qf(oinfo, 1);
+        if (ret < 0)
+                goto done;
+        trans = ocfs2_start_trans(osb,
+                ocfs2_calc_qinit_credits(sb, dquot->dq_type));
+        if (IS_ERR(trans)) {
+                ret = PTR_ERR(trans);
+                mlog_errno(ret);
+        } else {
+                ret = dquot_acquire(dquot);
+                ocfs2_commit_trans(osb, trans);
+        }
+        ocfs2_unlock_global_qf(oinfo, 1);
+done:
+        mlog_exit(ret);
+        return ret;
+}
+/*
+ * Mark the dquot dirty and write it out.
+ *
+ * When none of the DQ_LASTSET_B + QIF_* flags in 'mask' is set, or when a
+ * journal handle is already active, the dquot is written through
+ * ocfs2_write_dquot() only. Otherwise it is synced with the global quota
+ * file under the exclusive global quota file lock and then committed.
+ *
+ * Returns the status of the last operation performed (negative on error).
+ */
+static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
+{
+        /* Flags whose presence triggers an immediate global sync */
+        unsigned long mask = (1 << (DQ_LASTSET_B + QIF_ILIMITS_B)) |
+                             (1 << (DQ_LASTSET_B + QIF_BLIMITS_B)) |
+                             (1 << (DQ_LASTSET_B + QIF_INODES_B)) |
+                             (1 << (DQ_LASTSET_B + QIF_SPACE_B)) |
+                             (1 << (DQ_LASTSET_B + QIF_BTIME_B)) |
+                             (1 << (DQ_LASTSET_B + QIF_ITIME_B));
+        int sync = 0;
+        int status;
+        struct super_block *sb = dquot->dq_sb;
+        int type = dquot->dq_type;
+        struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+        handle_t *handle;
+        struct ocfs2_super *osb = OCFS2_SB(sb);
+        mlog_entry("id=%u, type=%d", dquot->dq_id, type);
+        dquot_mark_dquot_dirty(dquot);
+        /* In case user set some limits, sync dquot immediately to global
+         * quota file so that information propagates quicker */
+        spin_lock(&dq_data_lock);
+        if (dquot->dq_flags & mask)
+                sync = 1;
+        spin_unlock(&dq_data_lock);
+        /* This is a slight hack but we can't afford getting global quota
+         * lock if we already have a transaction started. */
+        if (!sync || journal_current_handle()) {
+                status = ocfs2_write_dquot(dquot);
+                goto out;
+        }
+        status = ocfs2_lock_global_qf(oinfo, 1);
+        if (status < 0)
+                goto out;
+        handle = ocfs2_start_trans(osb, OCFS2_QSYNC_CREDITS);
+        if (IS_ERR(handle)) {
+                status = PTR_ERR(handle);
+                mlog_errno(status);
+                goto out_ilock;
+        }
+        status = ocfs2_sync_dquot(dquot);
+        if (status < 0) {
+                mlog_errno(status);
+                goto out_trans;
+        }
+        /* Now write updated local dquot structure */
+        status = dquot_commit(dquot);
+out_trans:
+        /* The transaction is committed on the sync error path as well */
+        ocfs2_commit_trans(osb, handle);
+out_ilock:
+        ocfs2_unlock_global_qf(oinfo, 1);
+out:
+        mlog_exit(status);
+        return status;
+}
+/*
+ * This should happen only after set_dqinfo().
+ *
+ * Calls dquot_commit_info() with the global quota file locked exclusively
+ * and inside a transaction with OCFS2_QINFO_WRITE_CREDITS credits. Returns
+ * the dquot_commit_info() result or the error from locking / starting the
+ * transaction.
+ */
+static int ocfs2_write_info(struct super_block *sb, int type)
+{
+        struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+        handle_t *trans;
+        int ret;
+        mlog_entry_void();
+        ret = ocfs2_lock_global_qf(oinfo, 1);
+        if (ret < 0)
+                goto done;
+        trans = ocfs2_start_trans(OCFS2_SB(sb), OCFS2_QINFO_WRITE_CREDITS);
+        if (IS_ERR(trans)) {
+                ret = PTR_ERR(trans);
+                mlog_errno(ret);
+        } else {
+                ret = dquot_commit_info(sb, type);
+                ocfs2_commit_trans(OCFS2_SB(sb), trans);
+        }
+        ocfs2_unlock_global_qf(oinfo, 1);
+done:
+        mlog_exit(ret);
+        return ret;
+}
+/*
+ * Allocate a zeroed struct ocfs2_dquot from ocfs2_dquot_cachep and return
+ * the generic struct dquot embedded in it, or NULL when allocation fails.
+ */
+static struct dquot *ocfs2_alloc_dquot(struct super_block *sb, int type)
+{
+        struct ocfs2_dquot *odquot;
+        odquot = kmem_cache_zalloc(ocfs2_dquot_cachep, GFP_NOFS);
+        return odquot ? &odquot->dq_dquot : NULL;
+}
+/*
+ * Free a dquot obtained from ocfs2_alloc_dquot().
+ *
+ * ocfs2_alloc_dquot() hands out the struct dquot embedded in a struct
+ * ocfs2_dquot, so convert back to the containing object before returning it
+ * to ocfs2_dquot_cachep. Passing the embedded pointer directly is correct
+ * only while dq_dquot sits at offset 0 of struct ocfs2_dquot.
+ */
+static void ocfs2_destroy_dquot(struct dquot *dquot)
+{
+        kmem_cache_free(ocfs2_dquot_cachep, OCFS2_DQUOT(dquot));
+}
+/*
+ * Quota operations for ocfs2. The first group of hooks uses the generic
+ * dquot_*() implementations unchanged; the remaining ones are the ocfs2
+ * specific functions defined in this file.
+ */
+struct dquot_operations ocfs2_quota_operations = {
+        .initialize     = dquot_initialize,
+        .drop           = dquot_drop,
+        .alloc_space    = dquot_alloc_space,
+        .alloc_inode    = dquot_alloc_inode,
+        .free_space     = dquot_free_space,
+        .free_inode     = dquot_free_inode,
+        .transfer       = dquot_transfer,
+        .write_dquot    = ocfs2_write_dquot,
+        .acquire_dquot  = ocfs2_acquire_dquot,
+        .release_dquot  = ocfs2_release_dquot,
+        .mark_dirty     = ocfs2_mark_dquot_dirty,
+        .write_info     = ocfs2_write_info,
+        .alloc_dquot    = ocfs2_alloc_dquot,
+        .destroy_dquot  = ocfs2_destroy_dquot,
+};
+/*
+ * Create the "o2quot" workqueue used for quota syncing. Returns 0 on
+ * success or -ENOMEM when the workqueue cannot be created.
+ */
+int ocfs2_quota_setup(void)
+{
+        ocfs2_quota_wq = create_workqueue("o2quot");
+        return ocfs2_quota_wq ? 0 : -ENOMEM;
+}
+/* Flush and destroy the quota workqueue; does nothing if it does not exist. */
+void ocfs2_quota_shutdown(void)
+{
+        if (!ocfs2_quota_wq)
+                return;
+        flush_workqueue(ocfs2_quota_wq);
+        destroy_workqueue(ocfs2_quota_wq);
+        ocfs2_quota_wq = NULL;
+}
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
new file mode 100644
index 000000000000..07deec5e9721
--- /dev/null
+++ b/fs/ocfs2/quota_local.c
@@ -0,0 +1,1253 @@
+/*
+ *  Implementation of operations over local quota file
+ */
+#include <linux/fs.h>
+#include <linux/quota.h>
+#include <linux/quotaops.h>
+#include <linux/module.h>
+#define MLOG_MASK_PREFIX ML_QUOTA
+#include <cluster/masklog.h>
+#include "ocfs2_fs.h"
+#include "ocfs2.h"
+#include "inode.h"
+#include "alloc.h"
+#include "file.h"
+#include "buffer_head_io.h"
+#include "journal.h"
+#include "sysfile.h"
+#include "dlmglue.h"
+#include "quota.h"
+/* How many local quota structures fit into the usable part of one block */
+static inline unsigned int ol_quota_entries_per_block(struct super_block *sb)
+{
+        const size_t entry_size = sizeof(struct ocfs2_local_disk_dqblk);
+        return (sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) / entry_size;
+}
+/* Number of blocks with entries in one chunk */
+static inline unsigned int ol_chunk_blocks(struct super_block *sb)
+{
+        /* Bytes left in the chunk header block, converted to bits (<< 3) */
+        unsigned long header_bits =
+                (sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) -
+                 OCFS2_QBLK_RESERVED_SPACE) << 3;
+        return header_bits / ol_quota_entries_per_block(sb);
+}
+/* Number of entries covered by one chunk: blocks per chunk * entries per block */
+static unsigned int ol_chunk_entries(struct super_block *sb)
+{
+        unsigned int blocks = ol_chunk_blocks(sb);
+        unsigned int per_block = ol_quota_entries_per_block(sb);
+        return blocks * per_block;
+}
+/* Block number of the header of chunk c in the quota file */
+static unsigned int ol_quota_chunk_block(struct super_block *sb, int c)
+{
+        /* Each chunk occupies its entry blocks plus 1 block of chunk info */
+        unsigned int stride = ol_chunk_blocks(sb) + 1;
+        /* Block 0 holds the local quota file info */
+        return 1 + stride * c;
+}
+/* Block number holding entry number off of chunk c */
+static unsigned int ol_dqblk_block(struct super_block *sb, int c, int off)
+{
+        int per_block = ol_quota_entries_per_block(sb);
+        int blk_in_chunk = off / per_block;
+        /* Skip the chunk header block, then index into the entry blocks */
+        return ol_quota_chunk_block(sb, c) + 1 + blk_in_chunk;
+}
+/* Byte offset of entry number off within its block (c is not used) */
+static unsigned int ol_dqblk_block_off(struct super_block *sb, int c, int off)
+{
+        int per_block = ol_quota_entries_per_block(sb);
+        int slot = off % per_block;
+        return slot * sizeof(struct ocfs2_local_disk_dqblk);
+}
+/*
+ * Offset of the dquot structure in the quota file.
+ *
+ * c is the chunk number and off the entry number within that chunk. The
+ * block number is widened to loff_t before it is shifted: it is an unsigned
+ * int, so shifting it first would wrap the byte offset at 4 GiB.
+ */
+static loff_t ol_dqblk_off(struct super_block *sb, int c, int off)
+{
+        return ((loff_t)ol_dqblk_block(sb, c, off) << sb->s_blocksize_bits) +
+               ol_dqblk_block_off(sb, c, off);
+}
+/* Block number in the quota file that contains byte offset off */
+static inline unsigned int ol_dqblk_file_block(struct super_block *sb, loff_t off)
+{
+        unsigned int blk = off >> sb->s_blocksize_bits;
+        return blk;
+}
+/* Position of byte offset off inside its block */
+static inline unsigned int ol_dqblk_block_offset(struct super_block *sb, loff_t off)
+{
+        const int in_block_mask = (1 << sb->s_blocksize_bits) - 1;
+        return off & in_block_mask;
+}
+/* Compute offset in the chunk of a structure with the given offset.
+ *
+ * off is a byte offset in the quota file and c the chunk it belongs to. The
+ * result is the entry number within chunk c: the number of entry blocks
+ * before the block containing off, times the entries per block, plus the
+ * entry index inside that block. */
+static int ol_dqblk_chunk_off(struct super_block *sb, int c, loff_t off)
+{
+        int epb = ol_quota_entries_per_block(sb);
+        /* The "- 1" skips the chunk header block */
+        return ((off >> sb->s_blocksize_bits) -
+                        ol_quota_chunk_block(sb, c) - 1) * epb
+               + ((unsigned int)(off & ((1 << sb->s_blocksize_bits) - 1))) /
+                 sizeof(struct ocfs2_local_disk_dqblk);
+}
+/*
+ * Modify a buffer head of the given inode inside a one-credit transaction.
+ *
+ * The modify callback is invoked with the buffer locked and is handed
+ * 'private' unchanged. Returns 0 on success or the negative error from
+ * starting the transaction, getting journal access, dirtying the buffer or
+ * committing.
+ */
+static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh,
+                void (*modify)(struct buffer_head *, void *), void *private)
+{
+        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        handle_t *trans;
+        int ret;
+        trans = ocfs2_start_trans(osb, 1);
+        if (IS_ERR(trans)) {
+                ret = PTR_ERR(trans);
+                mlog_errno(ret);
+                return ret;
+        }
+        ret = ocfs2_journal_access_dq(trans, inode, bh,
+                                      OCFS2_JOURNAL_ACCESS_WRITE);
+        if (ret < 0)
+                goto abort;
+        lock_buffer(bh);
+        modify(bh, private);
+        unlock_buffer(bh);
+        ret = ocfs2_journal_dirty(trans, bh);
+        if (ret < 0)
+                goto abort;
+        ret = ocfs2_commit_trans(osb, trans);
+        if (ret >= 0)
+                return 0;
+        mlog_errno(ret);
+        return ret;
+abort:
+        /* Report the failure, then close the transaction we opened */
+        mlog_errno(ret);
+        ocfs2_commit_trans(osb, trans);
+        return ret;
+}
+/* Check whether we understand format of quota files.
+ *
+ * Reads block 0 of the local quota file and of the global quota file of the
+ * given type and compares the magic number and version in each header with
+ * the expected values.
+ *
+ * Returns 1 when both headers match and 0 otherwise; read failures and
+ * mismatches are logged and also yield 0. */
+static int ocfs2_local_check_quota_file(struct super_block *sb, int type)
+{
+        /* Expected magic / version per quota type for both files */
+        unsigned int lmagics[MAXQUOTAS] = OCFS2_LOCAL_QMAGICS;
+        unsigned int lversions[MAXQUOTAS] = OCFS2_LOCAL_QVERSIONS;
+        unsigned int gmagics[MAXQUOTAS] = OCFS2_GLOBAL_QMAGICS;
+        unsigned int gversions[MAXQUOTAS] = OCFS2_GLOBAL_QVERSIONS;
+        unsigned int ino[MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE,
+                                        GROUP_QUOTA_SYSTEM_INODE };
+        struct buffer_head *bh = NULL;
+        struct inode *linode = sb_dqopt(sb)->files[type];
+        struct inode *ginode = NULL;
+        struct ocfs2_disk_dqheader *dqhead;
+        /* ret stays 0 until every check has passed */
+        int status, ret = 0;
+        /* First check whether we understand local quota file */
+        status = ocfs2_read_quota_block(linode, 0, &bh);
+        if (status) {
+                mlog_errno(status);
+                mlog(ML_ERROR, "failed to read quota file header (type=%d)\n",
+                        type);
+                goto out_err;
+        }
+        dqhead = (struct ocfs2_disk_dqheader *)(bh->b_data);
+        if (le32_to_cpu(dqhead->dqh_magic) != lmagics[type]) {
+                mlog(ML_ERROR, "quota file magic does not match (%u != %u),"
+                        " type=%d\n", le32_to_cpu(dqhead->dqh_magic),
+                        lmagics[type], type);
+                goto out_err;
+        }
+        if (le32_to_cpu(dqhead->dqh_version) != lversions[type]) {
+                mlog(ML_ERROR, "quota file version does not match (%u != %u),"
+                        " type=%d\n", le32_to_cpu(dqhead->dqh_version),
+                        lversions[type], type);
+                goto out_err;
+        }
+        /* Reset bh so the cleanup at out_err never releases it twice */
+        brelse(bh);
+        bh = NULL;
+        /* Next check whether we understand global quota file */
+        ginode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type],
+                                                OCFS2_INVALID_SLOT);
+        if (!ginode) {
+                mlog(ML_ERROR, "cannot get global quota file inode "
+                                "(type=%d)\n", type);
+                goto out_err;
+        }
+        /* Since the header is read only, we don't care about locking */
+        status = ocfs2_read_quota_block(ginode, 0, &bh);
+        if (status) {
+                mlog_errno(status);
+                mlog(ML_ERROR, "failed to read global quota file header "
+                                "(type=%d)\n", type);
+                goto out_err;
+        }
+        dqhead = (struct ocfs2_disk_dqheader *)(bh->b_data);
+        if (le32_to_cpu(dqhead->dqh_magic) != gmagics[type]) {
+                mlog(ML_ERROR, "global quota file magic does not match "
+                        "(%u != %u), type=%d\n",
+                        le32_to_cpu(dqhead->dqh_magic), gmagics[type], type);
+                goto out_err;
+        }
+        if (le32_to_cpu(dqhead->dqh_version) != gversions[type]) {
+                mlog(ML_ERROR, "global quota file version does not match "
+                        "(%u != %u), type=%d\n",
+                        le32_to_cpu(dqhead->dqh_version), gversions[type],
+                        type);
+                goto out_err;
+        }
+        ret = 1;
+out_err:
+        /* Shared exit for success and failure; bh and ginode may be NULL here */
+        brelse(bh);
+        iput(ginode);
+        return ret;
+}
+/*
+ * Release given list of quota file chunks: unlink every chunk, drop its
+ * header buffer and return the chunk to ocfs2_qf_chunk_cachep.
+ */
+static void ocfs2_release_local_quota_bitmaps(struct list_head *head)
+{
+        struct ocfs2_quota_chunk *chunk, *tmp;
+        list_for_each_entry_safe(chunk, tmp, head, qc_chunk) {
+                list_del(&chunk->qc_chunk);
+                brelse(chunk->qc_headerbh);
+                kmem_cache_free(ocfs2_qf_chunk_cachep, chunk);
+        }
+}
+/*
+ * Load quota bitmaps into memory.
+ *
+ * Reads the header block of each of the ldinfo->dqi_chunks chunks and
+ * appends one struct ocfs2_quota_chunk per chunk to 'head'. On failure the
+ * chunks loaded so far are released again. Returns 0, -ENOMEM, or the error
+ * from ocfs2_read_quota_block().
+ */
+static int ocfs2_load_local_quota_bitmaps(struct inode *inode,
+                        struct ocfs2_local_disk_dqinfo *ldinfo,
+                        struct list_head *head)
+{
+        struct ocfs2_quota_chunk *chunk;
+        int idx = 0;
+        int err;
+        INIT_LIST_HEAD(head);
+        while (idx < le32_to_cpu(ldinfo->dqi_chunks)) {
+                chunk = kmem_cache_alloc(ocfs2_qf_chunk_cachep, GFP_NOFS);
+                if (!chunk) {
+                        err = -ENOMEM;
+                        goto fail;
+                }
+                chunk->qc_num = idx;
+                chunk->qc_headerbh = NULL;
+                err = ocfs2_read_quota_block(inode,
+                                ol_quota_chunk_block(inode->i_sb, idx),
+                                &chunk->qc_headerbh);
+                if (err) {
+                        mlog_errno(err);
+                        /* Not on the list yet, so free it by hand */
+                        kmem_cache_free(ocfs2_qf_chunk_cachep, chunk);
+                        goto fail;
+                }
+                list_add_tail(&chunk->qc_chunk, head);
+                idx++;
+        }
+        return 0;
+fail:
+        ocfs2_release_local_quota_bitmaps(head);
+        return err;
+}
+/*
+ * Buffer modification callback: copy flags, chunk count and block count from
+ * the in-memory quota info ('private' is a struct mem_dqinfo *) into the
+ * local quota info located at OCFS2_LOCAL_INFO_OFF in the buffer.
+ */
+static void olq_update_info(struct buffer_head *bh, void *private)
+{
+        struct mem_dqinfo *minfo = private;
+        struct ocfs2_mem_dqinfo *oinfo = minfo->dqi_priv;
+        struct ocfs2_local_disk_dqinfo *disk =
+                (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
+                                                   OCFS2_LOCAL_INFO_OFF);
+        spin_lock(&dq_data_lock);
+        disk->dqi_flags = cpu_to_le32(minfo->dqi_flags & DQF_MASK);
+        disk->dqi_chunks = cpu_to_le32(oinfo->dqi_chunks);
+        disk->dqi_blocks = cpu_to_le32(oinfo->dqi_blocks);
+        spin_unlock(&dq_data_lock);
+}
+/*
+ * Append a recovery record for chunk number 'chunk' to 'head', holding a
+ * private copy of the chunk's bitmap. Returns 0 or -ENOMEM.
+ */
+static int ocfs2_add_recovery_chunk(struct super_block *sb,
+                                    struct ocfs2_local_disk_chunk *dchunk,
+                                    int chunk,
+                                    struct list_head *head)
+{
+        struct ocfs2_recovery_chunk *rchunk;
+        rchunk = kmalloc(sizeof(*rchunk), GFP_NOFS);
+        if (!rchunk)
+                return -ENOMEM;
+        rchunk->rc_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS);
+        if (!rchunk->rc_bitmap) {
+                kfree(rchunk);
+                return -ENOMEM;
+        }
+        rchunk->rc_chunk = chunk;
+        /* One bit per entry of the chunk, rounded up to whole bytes */
+        memcpy(rchunk->rc_bitmap, dchunk->dqc_bitmap,
+               (ol_chunk_entries(sb) + 7) >> 3);
+        list_add_tail(&rchunk->rc_list, head);
+        return 0;
+}
+/* Unlink and free every recovery chunk on the list together with its bitmap */
+static void free_recovery_list(struct list_head *head)
+{
+        struct ocfs2_recovery_chunk *cur, *tmp;
+        list_for_each_entry_safe(cur, tmp, head, rc_list) {
+                list_del(&cur->rc_list);
+                kfree(cur->rc_bitmap);
+                kfree(cur);
+        }
+}
+/* Free a quota recovery structure including the chunk lists of all quota types */
+void ocfs2_free_quota_recovery(struct ocfs2_quota_recovery *rec)
+{
+        int t = 0;
+        while (t < MAXQUOTAS) {
+                free_recovery_list(&rec->r_list[t]);
+                t++;
+        }
+        kfree(rec);
+}
+/*
+ * Load entries in our quota file we have to recover.
+ *
+ * Walks the chunk headers of the local quota file and records every chunk
+ * whose free count is below the number of entries in a chunk. When the
+ * final status is negative the list built so far is freed. Returns 0 or the
+ * status of the failing read / allocation.
+ */
+static int ocfs2_recovery_load_quota(struct inode *lqinode,
+                                     struct ocfs2_local_disk_dqinfo *ldinfo,
+                                     int type,
+                                     struct list_head *head)
+{
+        struct super_block *sb = lqinode->i_sb;
+        struct ocfs2_local_disk_chunk *hdr;
+        struct buffer_head *hdr_bh;
+        int nchunks = le32_to_cpu(ldinfo->dqi_chunks);
+        int ret = 0;
+        int idx;
+        for (idx = 0; idx < nchunks; idx++) {
+                hdr_bh = NULL;
+                ret = ocfs2_read_quota_block(lqinode,
+                                             ol_quota_chunk_block(sb, idx),
+                                             &hdr_bh);
+                if (ret) {
+                        mlog_errno(ret);
+                        break;
+                }
+                hdr = (struct ocfs2_local_disk_chunk *)hdr_bh->b_data;
+                /* Chunks that are completely free have nothing to recover */
+                if (le32_to_cpu(hdr->dqc_free) < ol_chunk_entries(sb))
+                        ret = ocfs2_add_recovery_chunk(sb, hdr, idx, head);
+                brelse(hdr_bh);
+                if (ret < 0)
+                        break;
+        }
+        if (ret < 0)
+                free_recovery_list(head);
+        return ret;
+}
+/* Allocate a quota recovery structure with an empty chunk list for every
+ * quota type. Returns NULL when the allocation fails. */
+static struct ocfs2_quota_recovery *ocfs2_alloc_quota_recovery(void)
+{
+        struct ocfs2_quota_recovery *rec;
+        int i;
+        rec = kmalloc(sizeof(struct ocfs2_quota_recovery), GFP_NOFS);
+        if (rec) {
+                for (i = 0; i < MAXQUOTAS; i++)
+                        INIT_LIST_HEAD(&rec->r_list[i]);
+        }
+        return rec;
+}
+/* Load information we need for quota recovery into memory.
+ *
+ * For every quota type whose RO-compat feature bit is set on the superblock,
+ * the local quota file of @slot_num is looked up and locked, its header block
+ * is read and the chunks to recover are collected into the per-type list of a
+ * freshly allocated recovery structure.
+ *
+ * Returns the recovery structure, or an ERR_PTR() value when the allocation
+ * fails or the final status is negative (the structure is freed in that
+ * case). */
+struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
+                                                struct ocfs2_super *osb,
+                                                int slot_num)
+{
+        /* Both tables are indexed by quota type */
+        unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+                                            OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
+        unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
+                                        LOCAL_GROUP_QUOTA_SYSTEM_INODE };
+        struct super_block *sb = osb->sb;
+        struct ocfs2_local_disk_dqinfo *ldinfo;
+        struct inode *lqinode;
+        struct buffer_head *bh;
+        int type;
+        int status = 0;
+        struct ocfs2_quota_recovery *rec;
+        mlog(ML_NOTICE, "Beginning quota recovery in slot %u\n", slot_num);
+        rec = ocfs2_alloc_quota_recovery();
+        if (!rec)
+                return ERR_PTR(-ENOMEM);
+        /* First init... */
+        for (type = 0; type < MAXQUOTAS; type++) {
+                /* Quota type not enabled on this filesystem - nothing to do */
+                if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
+                        continue;
+                /* At this point, journal of the slot is already replayed so
+                 * we can trust metadata and data of the quota file */
+                lqinode = ocfs2_get_system_file_inode(osb, ino[type], slot_num);
+                if (!lqinode) {
+                        status = -ENOENT;
+                        goto out;
+                }
+                status = ocfs2_inode_lock_full(lqinode, NULL, 1,
+                                               OCFS2_META_LOCK_RECOVERY);
+                if (status < 0) {
+                        mlog_errno(status);
+                        goto out_put;
+                }
+                /* Now read local header */
+                bh = NULL;
+                status = ocfs2_read_quota_block(lqinode, 0, &bh);
+                if (status) {
+                        mlog_errno(status);
+                        mlog(ML_ERROR, "failed to read quota file info header "
+                                "(slot=%d type=%d)\n", slot_num, type);
+                        goto out_lock;
+                }
+                ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
+                                                        OCFS2_LOCAL_INFO_OFF);
+                status = ocfs2_recovery_load_quota(lqinode, ldinfo, type,
+                                                   &rec->r_list[type]);
+                brelse(bh);
+                /* Per-file cleanup; the labels are reached both by falling
+                 * through and by the error jumps above */
+out_lock:
+                ocfs2_inode_unlock(lqinode, 1);
+out_put:
+                iput(lqinode);
+                if (status < 0)
+                        break;
+        }
+out:
+        /* On failure drop everything collected so far and hand the error
+         * back encoded in the pointer */
+        if (status < 0) {
+                ocfs2_free_quota_recovery(rec);
+                rec = ERR_PTR(status);
+        }
+        return rec;
+}
+/* Sync changes in local quota file into global quota file and
+ * reinitialize local quota file.
+ * The function expects local quota file to be already locked and
+ * dqonoff_mutex locked.
+ *
+ * @lqinode: local quota file being recovered
+ * @type:    quota type; selects the list rec->r_list[type] to process
+ * @rec:     recovery structure; processed chunks are removed from its list
+ *           and the rest of the list is freed when a negative status results
+ *
+ * Returns 0 or the first failing status. */
+static int ocfs2_recover_local_quota_file(struct inode *lqinode,
+                                          int type,
+                                          struct ocfs2_quota_recovery *rec)
+{
+        struct super_block *sb = lqinode->i_sb;
+        struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+        struct ocfs2_local_disk_chunk *dchunk;
+        struct ocfs2_local_disk_dqblk *dqblk;
+        struct dquot *dquot;
+        handle_t *handle;
+        struct buffer_head *hbh = NULL, *qbh = NULL;
+        int status = 0;
+        int bit, chunk;
+        struct ocfs2_recovery_chunk *rchunk, *next;
+        qsize_t spacechange, inodechange;
+        mlog_entry("ino=%lu type=%u", (unsigned long)lqinode->i_ino, type);
+        status = ocfs2_lock_global_qf(oinfo, 1);
+        if (status < 0)
+                goto out;
+        /* One iteration per chunk that had used entries when recovery began */
+        list_for_each_entry_safe(rchunk, next, &(rec->r_list[type]), rc_list) {
+                chunk = rchunk->rc_chunk;
+                hbh = NULL;
+                status = ocfs2_read_quota_block(lqinode,
+                                                ol_quota_chunk_block(sb, chunk),
+                                                &hbh);
+                if (status) {
+                        mlog_errno(status);
+                        break;
+                }
+                dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data;
+                /* Walk the bits set in the saved bitmap copy; each one is a
+                 * used entry of this chunk */
+                for_each_bit(bit, rchunk->rc_bitmap, ol_chunk_entries(sb)) {
+                        qbh = NULL;
+                        status = ocfs2_read_quota_block(lqinode,
+                                                ol_dqblk_block(sb, chunk, bit),
+                                                &qbh);
+                        if (status) {
+                                mlog_errno(status);
+                                break;
+                        }
+                        dqblk = (struct ocfs2_local_disk_dqblk *)(qbh->b_data +
+                                ol_dqblk_block_off(sb, chunk, bit));
+                        dquot = dqget(sb, le64_to_cpu(dqblk->dqb_id), type);
+                        if (!dquot) {
+                                status = -EIO;
+                                mlog(ML_ERROR, "Failed to get quota structure "
+                                     "for id %u, type %d. Cannot finish quota "
+                                     "file recovery.\n",
+                                     (unsigned)le64_to_cpu(dqblk->dqb_id),
+                                     type);
+                                goto out_put_bh;
+                        }
+                        handle = ocfs2_start_trans(OCFS2_SB(sb),
+                                                   OCFS2_QSYNC_CREDITS);
+                        if (IS_ERR(handle)) {
+                                status = PTR_ERR(handle);
+                                mlog_errno(status);
+                                goto out_put_dquot;
+                        }
+                        mutex_lock(&sb_dqopt(sb)->dqio_mutex);
+                        spin_lock(&dq_data_lock);
+                        /* Add usage from quota entry into quota changes
+                         * of our node. Auxiliary variables are important
+                         * due to signedness */
+                        spacechange = le64_to_cpu(dqblk->dqb_spacemod);
+                        inodechange = le64_to_cpu(dqblk->dqb_inodemod);
+                        dquot->dq_dqb.dqb_curspace += spacechange;
+                        dquot->dq_dqb.dqb_curinodes += inodechange;
+                        spin_unlock(&dq_data_lock);
+                        /* We want to drop reference held by the crashed
+                         * node. Since we have our own reference we know
+                         * global structure actually won't be freed. */
+                        status = ocfs2_global_release_dquot(dquot);
+                        if (status < 0) {
+                                mlog_errno(status);
+                                goto out_commit;
+                        }
+                        /* Release local quota file entry */
+                        status = ocfs2_journal_access_dq(handle, lqinode,
+                                        qbh, OCFS2_JOURNAL_ACCESS_WRITE);
+                        if (status < 0) {
+                                mlog_errno(status);
+                                goto out_commit;
+                        }
+                        /* NOTE(review): the bitmap and free count changed
+                         * below live in dchunk, which points into hbh, while
+                         * the buffer that is locked, journalled and dirtied
+                         * is qbh - confirm this is intended. */
+                        lock_buffer(qbh);
+                        WARN_ON(!ocfs2_test_bit(bit, dchunk->dqc_bitmap));
+                        ocfs2_clear_bit(bit, dchunk->dqc_bitmap);
+                        le32_add_cpu(&dchunk->dqc_free, 1);
+                        unlock_buffer(qbh);
+                        status = ocfs2_journal_dirty(handle, qbh);
+                        if (status < 0)
+                                mlog_errno(status);
+                        /* Unwind in reverse order of acquisition; each label
+                         * is also the target of an error jump above */
+out_commit:
+                        mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
+                        ocfs2_commit_trans(OCFS2_SB(sb), handle);
+out_put_dquot:
+                        dqput(dquot);
+out_put_bh:
+                        brelse(qbh);
+                        if (status < 0)
+                                break;
+                }
+                /* The chunk is dropped from the list whether or not all of
+                 * its entries were processed successfully */
+                brelse(hbh);
+                list_del(&rchunk->rc_list);
+                kfree(rchunk->rc_bitmap);
+                kfree(rchunk);
+                if (status < 0)
+                        break;
+        }
+        ocfs2_unlock_global_qf(oinfo, 1);
+out:
+        /* After a failure the remaining chunks of this type are discarded */
+        if (status < 0)
+                free_recovery_list(&(rec->r_list[type]));
+        mlog_exit(status);
+        return status;
+}
+/* Finish quota recovery of the local quota files in slot @slot_num.
+ *
+ * For each quota type with chunks queued in @rec, the local quota file of the
+ * slot is locked without queueing (a file locked by someone else is skipped),
+ * its changes are synced into the global quota file when the file is not
+ * marked clean, and - unless the slot is our own - the file is marked clean
+ * afterwards. Runs with dqonoff_mutex held.
+ *
+ * @rec is always consumed: the structure and any chunks still queued on it
+ * are freed before returning.
+ *
+ * Returns 0, or a negative error code from the first type that failed. */
+int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
+                                struct ocfs2_quota_recovery *rec,
+                                int slot_num)
+{
+        /* Local quota system inode per quota type */
+        unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
+                                        LOCAL_GROUP_QUOTA_SYSTEM_INODE };
+        struct super_block *sb = osb->sb;
+        struct ocfs2_local_disk_dqinfo *ldinfo;
+        struct buffer_head *bh;
+        handle_t *handle;
+        int type;
+        int status = 0;
+        struct inode *lqinode;
+        unsigned int flags;
+        mlog(ML_NOTICE, "Finishing quota recovery in slot %u\n", slot_num);
+        mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
+        for (type = 0; type < MAXQUOTAS; type++) {
+                /* Nothing was queued for this type when recovery began */
+                if (list_empty(&(rec->r_list[type])))
+                        continue;
+                mlog(0, "Recovering quota in slot %d\n", slot_num);
+                lqinode = ocfs2_get_system_file_inode(osb, ino[type], slot_num);
+                if (!lqinode) {
+                        status = -ENOENT;
+                        goto out;
+                }
+                status = ocfs2_inode_lock_full(lqinode, NULL, 1,
+                                                       OCFS2_META_LOCK_NOQUEUE);
+                /* Someone else is holding the lock? Then he must be
+                 * doing the recovery. Just skip the file... */
+                if (status == -EAGAIN) {
+                        mlog(ML_NOTICE, "skipping quota recovery for slot %d "
+                             "because quota file is locked.\n", slot_num);
+                        status = 0;
+                        goto out_put;
+                } else if (status < 0) {
+                        mlog_errno(status);
+                        goto out_put;
+                }
+                /* Now read local header */
+                bh = NULL;
+                status = ocfs2_read_quota_block(lqinode, 0, &bh);
+                if (status) {
+                        mlog_errno(status);
+                        mlog(ML_ERROR, "failed to read quota file info header "
+                                "(slot=%d type=%d)\n", slot_num, type);
+                        goto out_lock;
+                }
+                ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
+                                                        OCFS2_LOCAL_INFO_OFF);
+                /* Is recovery still needed? */
+                flags = le32_to_cpu(ldinfo->dqi_flags);
+                if (!(flags & OLQF_CLEAN))
+                        status = ocfs2_recover_local_quota_file(lqinode,
+                                                                type,
+                                                                rec);
+                /* We don't want to mark file as clean when it is actually
+                 * active */
+                if (slot_num == osb->slot_num)
+                        goto out_bh;
+                /* Mark quota file as clean if we are recovering quota file of
+                 * some other node.
+                 * NOTE(review): a negative status from the recovery above is
+                 * overwritten from here on - confirm that is intended. */
+                handle = ocfs2_start_trans(osb, 1);
+                if (IS_ERR(handle)) {
+                        status = PTR_ERR(handle);
+                        mlog_errno(status);
+                        goto out_bh;
+                }
+                status = ocfs2_journal_access_dq(handle, lqinode, bh,
+                                                 OCFS2_JOURNAL_ACCESS_WRITE);
+                if (status < 0) {
+                        mlog_errno(status);
+                        goto out_trans;
+                }
+                lock_buffer(bh);
+                ldinfo->dqi_flags = cpu_to_le32(flags | OLQF_CLEAN);
+                unlock_buffer(bh);
+                status = ocfs2_journal_dirty(handle, bh);
+                if (status < 0)
+                        mlog_errno(status);
+out_trans:
+                ocfs2_commit_trans(osb, handle);
+out_bh:
+                brelse(bh);
+out_lock:
+                ocfs2_inode_unlock(lqinode, 1);
+out_put:
+                iput(lqinode);
+                if (status < 0)
+                        break;
+        }
+out:
+        mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
+        /* Chunks can still be queued here: the file was skipped because it
+         * was locked, it was already clean, or the loop stopped before later
+         * types were handled. A bare kfree(rec) would leak them, so release
+         * the lists together with the structure. */
+        ocfs2_free_quota_recovery(rec);
+        return status;
+}
+/* Read information header from quota file.
+ *
+ * Allocates the ocfs2 private quota info for (@sb, @type) and hangs it on
+ * the generic mem_dqinfo, reads the global info, takes the cluster lock on
+ * the local quota file and reads its header block. If the header is not
+ * marked clean, the chunks needing recovery are queued on the superblock's
+ * recovery structure. Finally the chunk bitmaps are loaded and the file is
+ * marked as in use on disk.
+ *
+ * On success the inode lock and the header buffers stay held and 0 is
+ * returned. On any failure everything acquired here is released and -1 is
+ * returned. */
+static int ocfs2_local_read_info(struct super_block *sb, int type)
+{
+        struct ocfs2_local_disk_dqinfo *ldinfo;
+        struct mem_dqinfo *info = sb_dqinfo(sb, type);
+        struct ocfs2_mem_dqinfo *oinfo;
+        struct inode *lqinode = sb_dqopt(sb)->files[type];
+        int status;
+        struct buffer_head *bh = NULL;
+        struct ocfs2_quota_recovery *rec;
+        int locked = 0;
+        info->dqi_maxblimit = 0x7fffffffffffffffLL;
+        info->dqi_maxilimit = 0x7fffffffffffffffLL;
+        oinfo = kmalloc(sizeof(struct ocfs2_mem_dqinfo), GFP_NOFS);
+        if (!oinfo) {
+                mlog(ML_ERROR, "failed to allocate memory for ocfs2 quota"
+                               " info.\n");
+                goto out_err;
+        }
+        info->dqi_priv = oinfo;
+        oinfo->dqi_type = type;
+        INIT_LIST_HEAD(&oinfo->dqi_chunk);
+        /* kmalloc() does not zero the memory and the error path below calls
+         * iput() on this field, so give it a defined value first */
+        oinfo->dqi_gqinode = NULL;
+        oinfo->dqi_rec = NULL;
+        oinfo->dqi_lqi_bh = NULL;
+        oinfo->dqi_ibh = NULL;
+        status = ocfs2_global_read_info(sb, type);
+        if (status < 0)
+                goto out_err;
+        status = ocfs2_inode_lock(lqinode, &oinfo->dqi_lqi_bh, 1);
+        if (status < 0) {
+                mlog_errno(status);
+                goto out_err;
+        }
+        locked = 1;
+        /* Now read local header */
+        status = ocfs2_read_quota_block(lqinode, 0, &bh);
+        if (status) {
+                mlog_errno(status);
+                mlog(ML_ERROR, "failed to read quota file info header "
+                        "(type=%d)\n", type);
+                goto out_err;
+        }
+        ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
+                                                OCFS2_LOCAL_INFO_OFF);
+        info->dqi_flags = le32_to_cpu(ldinfo->dqi_flags);
+        oinfo->dqi_chunks = le32_to_cpu(ldinfo->dqi_chunks);
+        oinfo->dqi_blocks = le32_to_cpu(ldinfo->dqi_blocks);
+        oinfo->dqi_ibh = bh;
+        /* We crashed when using local quota file? */
+        if (!(info->dqi_flags & OLQF_CLEAN)) {
+                /* Reuse the superblock's recovery structure if one exists,
+                 * otherwise create it now */
+                rec = OCFS2_SB(sb)->quota_rec;
+                if (!rec) {
+                        rec = ocfs2_alloc_quota_recovery();
+                        if (!rec) {
+                                status = -ENOMEM;
+                                mlog_errno(status);
+                                goto out_err;
+                        }
+                        OCFS2_SB(sb)->quota_rec = rec;
+                }
+                status = ocfs2_recovery_load_quota(lqinode, ldinfo, type,
+                                                   &rec->r_list[type]);
+                if (status < 0) {
+                        mlog_errno(status);
+                        goto out_err;
+                }
+        }
+        status = ocfs2_load_local_quota_bitmaps(lqinode,
+                                                ldinfo,
+                                                &oinfo->dqi_chunk);
+        if (status < 0) {
+                mlog_errno(status);
+                goto out_err;
+        }
+        /* Now mark quota file as used */
+        info->dqi_flags &= ~OLQF_CLEAN;
+        status = ocfs2_modify_bh(lqinode, bh, olq_update_info, info);
+        if (status < 0) {
+                mlog_errno(status);
+                goto out_err;
+        }
+        return 0;
+out_err:
+        if (oinfo) {
+                /* NOTE(review): dqi_gqlock is torn down here even when
+                 * ocfs2_global_read_info() failed; whether the lock resource
+                 * is initialised on every failure path of that function is
+                 * not visible from here - confirm. */
+                iput(oinfo->dqi_gqinode);
+                ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock);
+                ocfs2_lock_res_free(&oinfo->dqi_gqlock);
+                brelse(oinfo->dqi_lqi_bh);
+                if (locked)
+                        ocfs2_inode_unlock(lqinode, 1);
+                ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk);
+                kfree(oinfo);
+                /* Do not leave a pointer to freed memory behind */
+                info->dqi_priv = NULL;
+        }
+        brelse(bh);
+        return -1;
+}
+/* Write local info to quota file.
+ * Applies olq_update_info() to the cached info buffer of quota type @type
+ * through ocfs2_modify_bh(). Returns 0 on success and -1 on failure. */
+static int ocfs2_local_write_info(struct super_block *sb, int type)
+{
+        struct mem_dqinfo *info = sb_dqinfo(sb, type);
+        struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+        int ret;
+        ret = ocfs2_modify_bh(sb_dqopt(sb)->files[type], oinfo->dqi_ibh,
+                              olq_update_info, info);
+        if (ret >= 0)
+                return 0;
+        mlog_errno(ret);
+        return -1;
+}
+/* Release info from memory.
+ * Stops the sync work, drops the global quota inode and its lock resource,
+ * checks that every local chunk has all of its entries free, releases the
+ * chunk bitmaps and any pending recovery data and, if nothing was found in
+ * use, marks the local quota file clean on disk. The local file is unlocked
+ * and the private info freed in every case. Always returns 0. */
+static int ocfs2_local_free_info(struct super_block *sb, int type)
+{
+        struct mem_dqinfo *info = sb_dqinfo(sb, type);
+        struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+        struct ocfs2_local_disk_chunk *dchunk;
+        struct ocfs2_quota_chunk *chunk;
+        int status;
+        int len;
+        int mark_clean = 1;
+        /* At this point we know there are no more dquots and thus
+         * even if there's some sync in the pdflush queue, it won't
+         * find any dquots and return without doing anything */
+        cancel_delayed_work_sync(&oinfo->dqi_sync_work);
+        iput(oinfo->dqi_gqinode);
+        ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock);
+        ocfs2_lock_res_free(&oinfo->dqi_gqlock);
+        list_for_each_entry(chunk, &oinfo->dqi_chunk, qc_chunk) {
+                dchunk = (struct ocfs2_local_disk_chunk *)
+                                        (chunk->qc_headerbh->b_data);
+                /* Chunks before the last one are taken to hold
+                 * ol_chunk_entries() entries; for the last one the count is
+                 * derived from the number of blocks in the file */
+                if (chunk->qc_num >= oinfo->dqi_chunks - 1)
+                        len = (oinfo->dqi_blocks -
+                               ol_quota_chunk_block(sb, chunk->qc_num) - 1)
+                              * ol_quota_entries_per_block(sb);
+                else
+                        len = ol_chunk_entries(sb);
+                if (le32_to_cpu(dchunk->dqc_free) == len)
+                        continue;
+                /* Not all entries free? Bug! */
+                mlog(ML_ERROR, "releasing quota file with used "
+                                "entries (type=%d)\n", type);
+                mark_clean = 0;
+        }
+        ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk);
+        /* dqonoff_mutex protects us against racing with recovery thread... */
+        if (oinfo->dqi_rec) {
+                ocfs2_free_quota_recovery(oinfo->dqi_rec);
+                mark_clean = 0;
+        }
+        if (mark_clean) {
+                /* Mark local file as clean */
+                info->dqi_flags |= OLQF_CLEAN;
+                status = ocfs2_modify_bh(sb_dqopt(sb)->files[type],
+                                         oinfo->dqi_ibh,
+                                         olq_update_info,
+                                         info);
+                if (status < 0)
+                        mlog_errno(status);
+        }
+        ocfs2_inode_unlock(sb_dqopt(sb)->files[type], 1);
+        brelse(oinfo->dqi_ibh);
+        brelse(oinfo->dqi_lqi_bh);
+        kfree(oinfo);
+        return 0;
+}
+/* Buffer-modification callback: store the id of the dquot passed in @private
+ * and the difference between its current and original space/inode usage into
+ * the dquot's entry inside @bh. The usage is sampled under dq_data_lock. */
+static void olq_set_dquot(struct buffer_head *bh, void *private)
+{
+        struct ocfs2_dquot *od = private;
+        struct dquot *dq = &od->dq_dquot;
+        struct ocfs2_local_disk_dqblk *entry;
+        entry = (struct ocfs2_local_disk_dqblk *)
+                (bh->b_data + ol_dqblk_block_offset(dq->dq_sb,
+                                                    od->dq_local_off));
+        entry->dqb_id = cpu_to_le64(dq->dq_id);
+        spin_lock(&dq_data_lock);
+        entry->dqb_spacemod = cpu_to_le64(dq->dq_dqb.dqb_curspace -
+                                          od->dq_origspace);
+        entry->dqb_inodemod = cpu_to_le64(dq->dq_dqb.dqb_curinodes -
+                                          od->dq_originodes);
+        spin_unlock(&dq_data_lock);
+        mlog(0, "Writing local dquot %u space %lld inodes %lld\n",
+             dq->dq_id,
+             (long long)le64_to_cpu(entry->dqb_spacemod),
+             (long long)le64_to_cpu(entry->dqb_inodemod));
+}
+/* Write dquot to local quota file.
+ * Reads the block of the local quota file that holds the dquot's entry and
+ * updates the entry through ocfs2_modify_bh() with olq_set_dquot(). Returns
+ * the status of the step that failed, or of the update on success. */
+static int ocfs2_local_write_dquot(struct dquot *dquot)
+{
+        struct super_block *sb = dquot->dq_sb;
+        struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
+        struct buffer_head *entry_bh = NULL;
+        int ret;
+        ret = ocfs2_read_quota_block(sb_dqopt(sb)->files[dquot->dq_type],
+                                     ol_dqblk_file_block(sb, od->dq_local_off),
+                                     &entry_bh);
+        if (ret) {
+                mlog_errno(ret);
+        } else {
+                ret = ocfs2_modify_bh(sb_dqopt(sb)->files[dquot->dq_type],
+                                      entry_bh, olq_set_dquot, od);
+                if (ret < 0)
+                        mlog_errno(ret);
+        }
+        brelse(entry_bh);
+        return ret;
+}
+/* Find free entry in local quota file.
+ * Picks the first chunk on the in-memory chunk list whose header reports
+ * free entries and searches its bitmap for a zero bit. Returns the chunk and
+ * stores the bit number in @offset, returns NULL when no chunk reports free
+ * entries, or ERR_PTR(-EIO) when the bitmap has no zero bit although the
+ * header claims free entries. */
+static struct ocfs2_quota_chunk *ocfs2_find_free_entry(struct super_block *sb,
+                                                       int type,
+                                                       int *offset)
+{
+        struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+        struct ocfs2_quota_chunk *chunk, *hit = NULL;
+        struct ocfs2_local_disk_chunk *dchunk;
+        int nbits, bit;
+        list_for_each_entry(chunk, &oinfo->dqi_chunk, qc_chunk) {
+                dchunk = (struct ocfs2_local_disk_chunk *)
+                                                chunk->qc_headerbh->b_data;
+                if (le32_to_cpu(dchunk->dqc_free) > 0) {
+                        hit = chunk;
+                        break;
+                }
+        }
+        if (!hit)
+                return NULL;
+        /* Number of entries the chosen chunk covers: the computed remainder
+         * of the file for the last chunk, a whole chunk otherwise */
+        if (hit->qc_num >= oinfo->dqi_chunks - 1)
+                nbits = (oinfo->dqi_blocks -
+                         ol_quota_chunk_block(sb, hit->qc_num) - 1)
+                        * ol_quota_entries_per_block(sb);
+        else
+                nbits = ol_chunk_entries(sb);
+        bit = ocfs2_find_next_zero_bit(dchunk->dqc_bitmap, nbits, 0);
+        /* We failed? */
+        if (bit == nbits) {
+                mlog(ML_ERROR, "Did not find empty entry in chunk %d with %u"
+                     " entries free (type=%d)\n", hit->qc_num,
+                     le32_to_cpu(dchunk->dqc_free), type);
+                return ERR_PTR(-EIO);
+        }
+        *offset = bit;
+        return hit;
+}
+/* Add new chunk to the local quota file.
+ *
+ * Grows the file by two blocks, initialises the first of them (at file block
+ * oinfo->dqi_blocks) as a chunk header, accounts for the new blocks and chunk
+ * in the local info and appends an in-memory chunk to the chunk list.
+ *
+ * Returns the new chunk with *@offset set to 0, or an ERR_PTR() value on
+ * failure. */
+static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
+                                                        struct super_block *sb,
+                                                        int type,
+                                                        int *offset)
+{
+        struct mem_dqinfo *info = sb_dqinfo(sb, type);
+        struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+        struct inode *lqinode = sb_dqopt(sb)->files[type];
+        struct ocfs2_quota_chunk *chunk = NULL;
+        struct ocfs2_local_disk_chunk *dchunk;
+        int status;
+        handle_t *handle;
+        struct buffer_head *bh = NULL;
+        u64 p_blkno;
+        /* We are protected by dqio_sem so no locking needed */
+        status = ocfs2_extend_no_holes(lqinode,
+                                       lqinode->i_size + 2 * sb->s_blocksize,
+                                       lqinode->i_size);
+        if (status < 0) {
+                mlog_errno(status);
+                goto out;
+        }
+        status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh,
+                                          lqinode->i_size + 2 * sb->s_blocksize);
+        if (status < 0) {
+                mlog_errno(status);
+                goto out;
+        }
+        chunk = kmem_cache_alloc(ocfs2_qf_chunk_cachep, GFP_NOFS);
+        if (!chunk) {
+                status = -ENOMEM;
+                mlog_errno(status);
+                goto out;
+        }
+        /* Translate the file block of the new header to a disk block */
+        down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
+        status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks,
+                                             &p_blkno, NULL, NULL);
+        up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
+        if (status < 0) {
+                mlog_errno(status);
+                goto out;
+        }
+        bh = sb_getblk(sb, p_blkno);
+        if (!bh) {
+                status = -ENOMEM;
+                mlog_errno(status);
+                goto out;
+        }
+        dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data;
+        handle = ocfs2_start_trans(OCFS2_SB(sb), 2);
+        if (IS_ERR(handle)) {
+                status = PTR_ERR(handle);
+                mlog_errno(status);
+                goto out;
+        }
+        status = ocfs2_journal_access_dq(handle, lqinode, bh,
+                                         OCFS2_JOURNAL_ACCESS_WRITE);
+        if (status < 0) {
+                mlog_errno(status);
+                goto out_trans;
+        }
+        /* Fresh header: one block worth of free entries, empty bitmap */
+        lock_buffer(bh);
+        dchunk->dqc_free = cpu_to_le32(ol_quota_entries_per_block(sb));
+        memset(dchunk->dqc_bitmap, 0,
+               sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) -
+               OCFS2_QBLK_RESERVED_SPACE);
+        set_buffer_uptodate(bh);
+        unlock_buffer(bh);
+        status = ocfs2_journal_dirty(handle, bh);
+        if (status < 0) {
+                mlog_errno(status);
+                goto out_trans;
+        }
+        oinfo->dqi_blocks += 2;
+        oinfo->dqi_chunks++;
+        status = ocfs2_local_write_info(sb, type);
+        if (status < 0) {
+                mlog_errno(status);
+                goto out_trans;
+        }
+        status = ocfs2_commit_trans(OCFS2_SB(sb), handle);
+        if (status < 0) {
+                mlog_errno(status);
+                goto out;
+        }
+        /* Number the chunk after the current last one. With an empty list
+         * there is no previous chunk - the list head is not embedded in a
+         * chunk and must not be passed to list_entry() - so numbering starts
+         * at 0. */
+        if (list_empty(&oinfo->dqi_chunk))
+                chunk->qc_num = 0;
+        else
+                chunk->qc_num = list_entry(oinfo->dqi_chunk.prev,
+                                           struct ocfs2_quota_chunk,
+                                           qc_chunk)->qc_num + 1;
+        list_add_tail(&chunk->qc_chunk, &oinfo->dqi_chunk);
+        chunk->qc_headerbh = bh;
+        *offset = 0;
+        return chunk;
+out_trans:
+        ocfs2_commit_trans(OCFS2_SB(sb), handle);
+out:
+        brelse(bh);
+        /* chunk is still NULL when we failed before allocating it */
+        if (chunk)
+                kmem_cache_free(ocfs2_qf_chunk_cachep, chunk);
+        return ERR_PTR(status);
+}
+/* Make room for more entries in the local quota file.
+ * When there is no chunk yet, or the last chunk already spans
+ * ol_chunk_blocks() blocks, a whole new chunk is added. Otherwise the file
+ * is grown by one block, the free count in the last chunk's header is raised
+ * by one block worth of entries and the local info is rewritten.
+ * Returns the chunk to allocate from with *@offset set to the first new
+ * entry, or an ERR_PTR() value on failure. */
+static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
+                                                       struct super_block *sb,
+                                                       int type,
+                                                       int *offset)
+{
+        struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+        struct inode *lqinode = sb_dqopt(sb)->files[type];
+        struct ocfs2_quota_chunk *last;
+        struct ocfs2_local_disk_chunk *hdr;
+        struct buffer_head *hdr_bh;
+        int epb = ol_quota_entries_per_block(sb);
+        unsigned int used_blocks;
+        handle_t *handle;
+        int ret;
+        if (list_empty(&oinfo->dqi_chunk))
+                return ocfs2_local_quota_add_chunk(sb, type, offset);
+        /* Is the last chunk full? */
+        last = list_entry(oinfo->dqi_chunk.prev,
+                          struct ocfs2_quota_chunk, qc_chunk);
+        used_blocks = oinfo->dqi_blocks -
+                      ol_quota_chunk_block(sb, last->qc_num) - 1;
+        if (ol_chunk_blocks(sb) == used_blocks)
+                return ocfs2_local_quota_add_chunk(sb, type, offset);
+        hdr_bh = last->qc_headerbh;
+        /* We are protected by dqio_sem so no locking needed */
+        ret = ocfs2_extend_no_holes(lqinode,
+                                    lqinode->i_size + sb->s_blocksize,
+                                    lqinode->i_size);
+        if (ret < 0) {
+                mlog_errno(ret);
+                return ERR_PTR(ret);
+        }
+        ret = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh,
+                                       lqinode->i_size + sb->s_blocksize);
+        if (ret < 0) {
+                mlog_errno(ret);
+                return ERR_PTR(ret);
+        }
+        handle = ocfs2_start_trans(OCFS2_SB(sb), 2);
+        if (IS_ERR(handle)) {
+                ret = PTR_ERR(handle);
+                mlog_errno(ret);
+                return ERR_PTR(ret);
+        }
+        ret = ocfs2_journal_access_dq(handle, lqinode, hdr_bh,
+                                      OCFS2_JOURNAL_ACCESS_WRITE);
+        if (ret < 0) {
+                mlog_errno(ret);
+                goto abort_trans;
+        }
+        /* The new block adds one block worth of free entries to the chunk */
+        hdr = (struct ocfs2_local_disk_chunk *)hdr_bh->b_data;
+        lock_buffer(hdr_bh);
+        le32_add_cpu(&hdr->dqc_free, ol_quota_entries_per_block(sb));
+        unlock_buffer(hdr_bh);
+        ret = ocfs2_journal_dirty(handle, hdr_bh);
+        if (ret < 0) {
+                mlog_errno(ret);
+                goto abort_trans;
+        }
+        oinfo->dqi_blocks++;
+        ret = ocfs2_local_write_info(sb, type);
+        if (ret < 0) {
+                mlog_errno(ret);
+                goto abort_trans;
+        }
+        ret = ocfs2_commit_trans(OCFS2_SB(sb), handle);
+        if (ret < 0) {
+                mlog_errno(ret);
+                return ERR_PTR(ret);
+        }
+        *offset = used_blocks * epb;
+        return last;
+abort_trans:
+        ocfs2_commit_trans(OCFS2_SB(sb), handle);
+        return ERR_PTR(ret);
+}
+/* Buffer-modification callback: set the bit whose number @private points to
+ * in the chunk header bitmap held in @bh and lower the header's free count
+ * by one. */
+static void olq_alloc_dquot(struct buffer_head *bh, void *private)
+{
+        struct ocfs2_local_disk_chunk *hdr =
+                                (struct ocfs2_local_disk_chunk *)bh->b_data;
+        int bit = *(int *)private;
+        ocfs2_set_bit(bit, hdr->dqc_bitmap);
+        le32_add_cpu(&hdr->dqc_free, -1);
+}
+/* Create dquot in the local file for given id.
+ * Finds a free entry (extending the local quota file when the search
+ * returns no chunk), records the chunk and offset in the ocfs2 dquot,
+ * writes the initial on-disk structure and finally marks the entry as
+ * used in the chunk header. Returns the status of the last step that
+ * ran; negative values are errors. */
+static int ocfs2_create_local_dquot(struct dquot *dquot)
+{
+        struct super_block *sb = dquot->dq_sb;
+        int type = dquot->dq_type;
+        struct inode *lqinode = sb_dqopt(sb)->files[type];
+        struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
+        struct ocfs2_quota_chunk *chunk;
+        int offset;
+        int status;
+        chunk = ocfs2_find_free_entry(sb, type, &offset);
+        /* NULL means no free entry was found: extend the quota file */
+        if (!chunk)
+                chunk = ocfs2_extend_local_quota_file(sb, type, &offset);
+        if (IS_ERR(chunk))
+                return PTR_ERR(chunk);
+        od->dq_chunk = chunk;
+        od->dq_local_off = ol_dqblk_off(sb, chunk->qc_num, offset);
+        /* Initialize dquot structure on disk */
+        status = ocfs2_local_write_dquot(dquot);
+        if (status < 0) {
+                mlog_errno(status);
+                return status;
+        }
+        /* Mark structure as allocated */
+        status = ocfs2_modify_bh(lqinode, chunk->qc_headerbh, olq_alloc_dquot,
+                                 &offset);
+        if (status < 0)
+                mlog_errno(status);
+        return status;
+}
+/* Create entry in local file for dquot, load data from the global file.
+ * The global read happens first; the local entry is only created when
+ * that read did not fail. Returns 0 on success, negative errno otherwise. */
+static int ocfs2_local_read_dquot(struct dquot *dquot)
+{
+        int ret;
+        mlog_entry("id=%u, type=%d\n", dquot->dq_id, dquot->dq_type);
+        ret = ocfs2_global_read_dquot(dquot);
+        /* Now create entry in the local quota file */
+        if (ret >= 0)
+                ret = ocfs2_create_local_dquot(dquot);
+        if (ret < 0)
+                mlog_errno(ret);
+        else
+                ret = 0;
+        mlog_exit(ret);
+        return ret;
+}
+/* Release dquot structure from local quota file. The caller,
+ * ocfs2_release_dquot(), has already started a transaction and holds the
+ * exclusive lock for the global quota file, so a journal handle must be
+ * active here. Returns 0 on success, negative errno otherwise. */
+static int ocfs2_local_release_dquot(struct dquot *dquot)
+{
+        struct super_block *sb = dquot->dq_sb;
+        struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
+        int type = dquot->dq_type;
+        handle_t *handle = journal_current_handle();
+        struct buffer_head *hdr_bh;
+        struct ocfs2_local_disk_chunk *dchunk;
+        int bit;
+        int status;
+        BUG_ON(!handle);
+        /* First write all local changes to global file */
+        status = ocfs2_global_release_dquot(dquot);
+        if (status < 0) {
+                mlog_errno(status);
+                goto out;
+        }
+        /* Header buffer of the chunk that holds this dquot's entry */
+        hdr_bh = od->dq_chunk->qc_headerbh;
+        status = ocfs2_journal_access_dq(handle, sb_dqopt(sb)->files[type],
+                                         hdr_bh, OCFS2_JOURNAL_ACCESS_WRITE);
+        if (status < 0) {
+                mlog_errno(status);
+                goto out;
+        }
+        bit = ol_dqblk_chunk_off(sb, od->dq_chunk->qc_num,
+                                 od->dq_local_off);
+        dchunk = (struct ocfs2_local_disk_chunk *)hdr_bh->b_data;
+        /* Mark structure as freed */
+        lock_buffer(hdr_bh);
+        ocfs2_clear_bit(bit, dchunk->dqc_bitmap);
+        le32_add_cpu(&dchunk->dqc_free, 1);
+        unlock_buffer(hdr_bh);
+        status = ocfs2_journal_dirty(handle, hdr_bh);
+        if (status < 0)
+                mlog_errno(status);
+        else
+                status = 0;
+out:
+        /* Drop the read bit on every path, so the next user of this
+         * dquot rereads it from disk and allocates a new local
+         * dquot structure */
+        clear_bit(DQ_READ_B, &dquot->dq_flags);
+        return status;
+}
+static struct quota_format_ops ocfs2_format_ops = { /* OCFS2 quota hooks */
+        .check_quota_file       = ocfs2_local_check_quota_file,
+        .read_file_info         = ocfs2_local_read_info,
+        .write_file_info        = ocfs2_global_write_info, /* global, not local */
+        .free_file_info         = ocfs2_local_free_info,
+        .read_dqblk             = ocfs2_local_read_dquot, /* global read + local entry */
+        .commit_dqblk           = ocfs2_local_write_dquot,
+        .release_dqblk          = ocfs2_local_release_dquot, /* needs running handle */
+};
+struct quota_format_type ocfs2_quota_format = { /* not static: visible outside this file */
+        .qf_fmt_id = QFMT_OCFS2,
+        .qf_ops = &ocfs2_format_ops, /* callback table defined just above */
+        .qf_owner = THIS_MODULE
+};
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index ffd48db229a7..424adaa5f900 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -106,8 +106,8 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
        mlog_entry("(new_clusters=%d, first_new_cluster = %u)\n",
                   new_clusters, first_new_cluster);
-        ret = ocfs2_journal_access(handle, bm_inode, group_bh,
+        ret = ocfs2_journal_access_gd(handle, bm_inode, group_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                                      OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret < 0) {
                mlog_errno(ret);
                goto out;
@@ -141,8 +141,8 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
        }
        /* update the inode accordingly. */
-        ret = ocfs2_journal_access(handle, bm_inode, bm_bh,
+        ret = ocfs2_journal_access_di(handle, bm_inode, bm_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                                      OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret < 0) {
                mlog_errno(ret);
                goto out_rollback;
@@ -314,6 +314,10 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
        fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
+        /* main_bm_bh is validated by inode read inside ocfs2_inode_lock(),
+         * so any corruption is a code bug. */
+        BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
        if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
                                 ocfs2_group_bitmap_size(osb->sb) * 8) {
                mlog(ML_ERROR, "The disk is too old and small. "
@@ -322,30 +326,18 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
                goto out_unlock;
        }
-        if (!OCFS2_IS_VALID_DINODE(fe)) {
-                OCFS2_RO_ON_INVALID_DINODE(main_bm_inode->i_sb, fe);
-                ret = -EIO;
-                goto out_unlock;
-        }
        first_new_cluster = le32_to_cpu(fe->i_clusters);
        lgd_blkno = ocfs2_which_cluster_group(main_bm_inode,
                                              first_new_cluster - 1);
-        ret = ocfs2_read_block(main_bm_inode, lgd_blkno, &group_bh);
+        ret = ocfs2_read_group_descriptor(main_bm_inode, fe, lgd_blkno,
+                                          &group_bh);
        if (ret < 0) {
                mlog_errno(ret);
                goto out_unlock;
        }
        group = (struct ocfs2_group_desc *)group_bh->b_data;
-        ret = ocfs2_check_group_descriptor(inode->i_sb, fe, group);
-        if (ret) {
-                mlog_errno(ret);
-                goto out_unlock;
-        }
        cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc);
        if (le16_to_cpu(group->bg_bits) / cl_bpc + new_clusters >
                le16_to_cpu(fe->id2.i_chain.cl_cpg)) {
@@ -398,41 +390,16 @@ static int ocfs2_check_new_group(struct inode *inode,
                                 struct buffer_head *group_bh)
 {
        int ret;
-        struct ocfs2_group_desc *gd;
+        struct ocfs2_group_desc *gd =
+                (struct ocfs2_group_desc *)group_bh->b_data;
        u16 cl_bpc = le16_to_cpu(di->id2.i_chain.cl_bpc);
-        unsigned int max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) *
-                                le16_to_cpu(di->id2.i_chain.cl_bpc);
-        gd = (struct ocfs2_group_desc *)group_bh->b_data;
+        ret = ocfs2_check_group_descriptor(inode->i_sb, di, group_bh);
+        if (ret)
+                goto out;
-        ret = -EIO;
+        ret = -EINVAL;
-        if (!OCFS2_IS_VALID_GROUP_DESC(gd))
+        if (le16_to_cpu(gd->bg_chain) != input->chain)
-                mlog(ML_ERROR, "Group descriptor # %llu isn't valid.\n",
-                     (unsigned long long)le64_to_cpu(gd->bg_blkno));
-        else if (di->i_blkno != gd->bg_parent_dinode)
-                mlog(ML_ERROR, "Group descriptor # %llu has bad parent "
-                     "pointer (%llu, expected %llu)\n",
-                     (unsigned long long)le64_to_cpu(gd->bg_blkno),
-                     (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
-                     (unsigned long long)le64_to_cpu(di->i_blkno));
-        else if (le16_to_cpu(gd->bg_bits) > max_bits)
-                mlog(ML_ERROR, "Group descriptor # %llu has bit count of %u\n",
-                     (unsigned long long)le64_to_cpu(gd->bg_blkno),
-                     le16_to_cpu(gd->bg_bits));
-        else if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits))
-                mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
-                     "claims that %u are free\n",
-                     (unsigned long long)le64_to_cpu(gd->bg_blkno),
-                     le16_to_cpu(gd->bg_bits),
-                     le16_to_cpu(gd->bg_free_bits_count));
-        else if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size)))
-                mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
-                     "max bitmap bits of %u\n",
-                     (unsigned long long)le64_to_cpu(gd->bg_blkno),
-                     le16_to_cpu(gd->bg_bits),
-                     8 * le16_to_cpu(gd->bg_size));
-        else if (le16_to_cpu(gd->bg_chain) != input->chain)
                mlog(ML_ERROR, "Group descriptor # %llu has bad chain %u "
                     "while input has %u set.\n",
                     (unsigned long long)le64_to_cpu(gd->bg_blkno),
@@ -451,6 +418,7 @@ static int ocfs2_check_new_group(struct inode *inode,
        else
                ret = 0;
+out:
        return ret;
 }
@@ -568,8 +536,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
        cl = &fe->id2.i_chain;
        cr = &cl->cl_recs[input->chain];
-        ret = ocfs2_journal_access(handle, main_bm_inode, group_bh,
+        ret = ocfs2_journal_access_gd(handle, main_bm_inode, group_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                                      OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret < 0) {
                mlog_errno(ret);
                goto out_commit;
@@ -584,8 +552,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
                goto out_commit;
        }
-        ret = ocfs2_journal_access(handle, main_bm_inode, main_bm_bh,
+        ret = ocfs2_journal_access_di(handle, main_bm_inode, main_bm_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                                      OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret < 0) {
                mlog_errno(ret);
                goto out_commit;
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index bdda2d8f8508..40661e7824e9 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -151,7 +151,7 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
         * this is not true, the read of -1 (UINT64_MAX) will fail.
         */
        ret = ocfs2_read_blocks(si->si_inode, -1, si->si_blocks, si->si_bh,
-                                OCFS2_BH_IGNORE_CACHE);
+                                OCFS2_BH_IGNORE_CACHE, NULL);
        if (ret == 0) {
                spin_lock(&osb->osb_lock);
                ocfs2_update_slot_info(si);
@@ -405,7 +405,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
                bh = NULL;  /* Acquire a fresh bh */
                status = ocfs2_read_blocks(si->si_inode, blkno, 1, &bh,
-                                           OCFS2_BH_IGNORE_CACHE);
+                                           OCFS2_BH_IGNORE_CACHE, NULL);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail;
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index c5ff18b46b57..a69628603e18 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -35,6 +35,7 @@
 #include "ocfs2.h"
 #include "alloc.h"
+#include "blockcheck.h"
 #include "dlmglue.h"
 #include "inode.h"
 #include "journal.h"
@@ -145,62 +146,183 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
        return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
 }
-/* somewhat more expensive than our other checks, so use sparingly. */
+#define do_error(fmt, ...)                                              \
-int ocfs2_check_group_descriptor(struct super_block *sb,
+        do{                                                             \
-                                 struct ocfs2_dinode *di,
+                if (clean_error)                                        \
-                                 struct ocfs2_group_desc *gd)
+                        mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__);        \
+                else                                                    \
+                        ocfs2_error(sb, fmt, ##__VA_ARGS__);            \
+        } while (0)
+static int ocfs2_validate_gd_self(struct super_block *sb,
+                                  struct buffer_head *bh,
+                                  int clean_error)
 {
-        unsigned int max_bits;
+        struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
        if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
-                OCFS2_RO_ON_INVALID_GROUP_DESC(sb, gd);
+                do_error("Group descriptor #%llu has bad signature %.*s",
-                return -EIO;
+                         (unsigned long long)bh->b_blocknr, 7,
+                         gd->bg_signature);
+                return -EINVAL;
        }
+        if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
+                do_error("Group descriptor #%llu has an invalid bg_blkno "
+                         "of %llu",
+                         (unsigned long long)bh->b_blocknr,
+                         (unsigned long long)le64_to_cpu(gd->bg_blkno));
+                return -EINVAL;
+        }
+        if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
+                do_error("Group descriptor #%llu has an invalid "
+                         "fs_generation of #%u",
+                         (unsigned long long)bh->b_blocknr,
+                         le32_to_cpu(gd->bg_generation));
+                return -EINVAL;
+        }
+        if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
+                do_error("Group descriptor #%llu has bit count %u but "
+                         "claims that %u are free",
+                         (unsigned long long)bh->b_blocknr,
+                         le16_to_cpu(gd->bg_bits),
+                         le16_to_cpu(gd->bg_free_bits_count));
+                return -EINVAL;
+        }
+        if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
+                do_error("Group descriptor #%llu has bit count %u but "
+                         "max bitmap bits of %u",
+                         (unsigned long long)bh->b_blocknr,
+                         le16_to_cpu(gd->bg_bits),
+                         8 * le16_to_cpu(gd->bg_size));
+                return -EINVAL;
+        }
+        return 0;
+}
+static int ocfs2_validate_gd_parent(struct super_block *sb,
+                                    struct ocfs2_dinode *di,
+                                    struct buffer_head *bh,
+                                    int clean_error)
+{
+        unsigned int max_bits;
+        struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
        if (di->i_blkno != gd->bg_parent_dinode) {
-                ocfs2_error(sb, "Group descriptor # %llu has bad parent "
+                do_error("Group descriptor #%llu has bad parent "
-                            "pointer (%llu, expected %llu)",
+                         "pointer (%llu, expected %llu)",
-                            (unsigned long long)le64_to_cpu(gd->bg_blkno),
+                         (unsigned long long)bh->b_blocknr,
-                            (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
+                         (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
-                            (unsigned long long)le64_to_cpu(di->i_blkno));
+                         (unsigned long long)le64_to_cpu(di->i_blkno));
-                return -EIO;
+                return -EINVAL;
        }
        max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
        if (le16_to_cpu(gd->bg_bits) > max_bits) {
-                ocfs2_error(sb, "Group descriptor # %llu has bit count of %u",
+                do_error("Group descriptor #%llu has bit count of %u",
-                            (unsigned long long)le64_to_cpu(gd->bg_blkno),
+                         (unsigned long long)bh->b_blocknr,
-                            le16_to_cpu(gd->bg_bits));
+                         le16_to_cpu(gd->bg_bits));
-                return -EIO;
+                return -EINVAL;
        }
        if (le16_to_cpu(gd->bg_chain) >=
            le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) {
-                ocfs2_error(sb, "Group descriptor # %llu has bad chain %u",
+                do_error("Group descriptor #%llu has bad chain %u",
-                            (unsigned long long)le64_to_cpu(gd->bg_blkno),
+                         (unsigned long long)bh->b_blocknr,
-                            le16_to_cpu(gd->bg_chain));
+                         le16_to_cpu(gd->bg_chain));
-                return -EIO;
+                return -EINVAL;
        }
-        if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
+        return 0;
-                ocfs2_error(sb, "Group descriptor # %llu has bit count %u but "
+}
-                            "claims that %u are free",
-                            (unsigned long long)le64_to_cpu(gd->bg_blkno),
-                            le16_to_cpu(gd->bg_bits),
-                            le16_to_cpu(gd->bg_free_bits_count));
-                return -EIO;
-        }
-        if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
+#undef do_error
-                ocfs2_error(sb, "Group descriptor # %llu has bit count %u but "
-                            "max bitmap bits of %u",
+/*
-                            (unsigned long long)le64_to_cpu(gd->bg_blkno),
+ * This version only prints errors.  It does not fail the filesystem, and
-                            le16_to_cpu(gd->bg_bits),
+ * exists only for resize.
-                            8 * le16_to_cpu(gd->bg_size));
+ */
-                return -EIO;
+int ocfs2_check_group_descriptor(struct super_block *sb,
+                                 struct ocfs2_dinode *di,
+                                 struct buffer_head *bh)
+{
+        struct ocfs2_group_desc *desc =
+                (struct ocfs2_group_desc *)bh->b_data;
+        int rc;
+        BUG_ON(!buffer_uptodate(bh));
+        /*
+         * A failed ecc check is reported and returned, but the
+         * filesystem keeps running: the damage is local to this block.
+         */
+        rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &desc->bg_check);
+        if (rc) {
+                mlog(ML_ERROR,
+                     "Checksum failed for group descriptor %llu\n",
+                     (unsigned long long)bh->b_blocknr);
+                return rc;
+        }
+        /*
+         * Passing 1 (clean_error) makes both validators log through
+         * mlog() instead of calling ocfs2_error().
+         */
+        rc = ocfs2_validate_gd_self(sb, bh, 1);
+        if (rc)
+                return rc;
+        return ocfs2_validate_gd_parent(sb, di, bh, 1);
+}
+/* Validation callback handed to ocfs2_read_block() by
+ * ocfs2_read_group_descriptor(): checks the block's ecc, then the
+ * descriptor's self-consistency. Returns 0 or the first failing check's
+ * error. */
+static int ocfs2_validate_group_descriptor(struct super_block *sb,
+                                           struct buffer_head *bh)
+{
+        struct ocfs2_group_desc *desc =
+                (struct ocfs2_group_desc *)bh->b_data;
+        int ret;
+        mlog(0, "Validating group descriptor %llu\n",
+             (unsigned long long)bh->b_blocknr);
+        BUG_ON(!buffer_uptodate(bh));
+        /*
+         * An ecc failure is returned as-is and the filesystem is left
+         * running, since such an error is local to this block.
+         */
+        ret = ocfs2_validate_meta_ecc(sb, bh->b_data, &desc->bg_check);
+        /*
+         * Errors after here are fatal.
+         */
+        if (!ret)
+                ret = ocfs2_validate_gd_self(sb, bh, 0);
+        return ret;
+}
+int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
+                                u64 gd_blkno, struct buffer_head **bh)
+{       /* Read group descriptor block @gd_blkno through ocfs2_read_block(),
+         * handing it ocfs2_validate_group_descriptor() as the validation
+         * callback, then check the descriptor against its parent dinode
+         * @di. Returns 0, or the first non-zero status encountered. */
+        int rc;
+        struct buffer_head *tmp = *bh;  /* caller's bh if set, else NULL */
+        rc = ocfs2_read_block(inode, gd_blkno, &tmp,
+                              ocfs2_validate_group_descriptor);
+        if (rc)
+                goto out;
+        rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0);
+        if (rc) {       /* parent check failed: release the buffer */
+                brelse(tmp);
+                goto out;
        }
-        return 0;
+        /* If ocfs2_read_block() got us a new bh, pass it up. */
+        if (!*bh)
+                *bh = tmp;
+out:
+        return rc;
 }
 static int ocfs2_block_group_fill(handle_t *handle,
@@ -225,10 +347,10 @@ static int ocfs2_block_group_fill(handle_t *handle,
                goto bail;
        }
-        status = ocfs2_journal_access(handle,
+        status = ocfs2_journal_access_gd(handle,
-                                      alloc_inode,
+                                         alloc_inode,
-                                      bg_bh,
+                                         bg_bh,
-                                      OCFS2_JOURNAL_ACCESS_CREATE);
+                                         OCFS2_JOURNAL_ACCESS_CREATE);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
@@ -358,8 +480,8 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
        bg = (struct ocfs2_group_desc *) bg_bh->b_data;
-        status = ocfs2_journal_access(handle, alloc_inode,
+        status = ocfs2_journal_access_di(handle, alloc_inode,
-                                      bh, OCFS2_JOURNAL_ACCESS_WRITE);
+                                         bh, OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
@@ -441,11 +563,11 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
        ac->ac_alloc_slot = slot;
        fe = (struct ocfs2_dinode *) bh->b_data;
-        if (!OCFS2_IS_VALID_DINODE(fe)) {
-                OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
+        /* The bh was validated by the inode read inside
-                status = -EIO;
+         * ocfs2_inode_lock().  Any corruption is a code bug. */
-                goto bail;
+        BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
-        }
        if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
                ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu",
                            (unsigned long long)le64_to_cpu(fe->i_blkno));
@@ -790,10 +912,9 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
        int offset, start, found, status = 0;
        struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
-        if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
+        /* Callers got this descriptor from
-                OCFS2_RO_ON_INVALID_GROUP_DESC(osb->sb, bg);
+         * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
-                return -EIO;
+        BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
-        }
        found = start = best_offset = best_size = 0;
        bitmap = bg->bg_bitmap;
@@ -858,11 +979,9 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
        mlog_entry_void();
-        if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
+        /* All callers get the descriptor via
-                OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
+         * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
-                status = -EIO;
+        BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
-                goto bail;
-        }
        BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
        mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
@@ -871,10 +990,10 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
        if (ocfs2_is_cluster_bitmap(alloc_inode))
                journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
-        status = ocfs2_journal_access(handle,
+        status = ocfs2_journal_access_gd(handle,
-                                      alloc_inode,
+                                         alloc_inode,
-                                      group_bh,
+                                         group_bh,
-                                      journal_type);
+                                         journal_type);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
@@ -931,21 +1050,10 @@ static int ocfs2_relink_block_group(handle_t *handle,
        struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
        struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
-        if (!OCFS2_IS_VALID_DINODE(fe)) {
+        /* The caller got these descriptors from
-                OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
+         * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
-                status = -EIO;
+        BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
-                goto out;
+        BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg));
-        }
-        if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
-                OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
-                status = -EIO;
-                goto out;
-        }
-        if (!OCFS2_IS_VALID_GROUP_DESC(prev_bg)) {
-                OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, prev_bg);
-                status = -EIO;
-                goto out;
-        }
        mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n",
             (unsigned long long)le64_to_cpu(fe->i_blkno), chain,
@@ -956,8 +1064,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
        bg_ptr = le64_to_cpu(bg->bg_next_group);
        prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
-        status = ocfs2_journal_access(handle, alloc_inode, prev_bg_bh,
+        status = ocfs2_journal_access_gd(handle, alloc_inode, prev_bg_bh,
-                                      OCFS2_JOURNAL_ACCESS_WRITE);
+                                         OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto out_rollback;
@@ -971,8 +1079,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
                goto out_rollback;
        }
-        status = ocfs2_journal_access(handle, alloc_inode, bg_bh,
+        status = ocfs2_journal_access_gd(handle, alloc_inode, bg_bh,
-                                      OCFS2_JOURNAL_ACCESS_WRITE);
+                                         OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto out_rollback;
@@ -986,8 +1094,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
                goto out_rollback;
        }
-        status = ocfs2_journal_access(handle, alloc_inode, fe_bh,
+        status = ocfs2_journal_access_di(handle, alloc_inode, fe_bh,
-                                      OCFS2_JOURNAL_ACCESS_WRITE);
+                                         OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto out_rollback;
@@ -1008,7 +1116,7 @@ out_rollback:
                bg->bg_next_group = cpu_to_le64(bg_ptr);
                prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
        }
-out:
        mlog_exit(status);
        return status;
 }
@@ -1138,8 +1246,8 @@ static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
        struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
        struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;
-        ret = ocfs2_journal_access(handle, inode, di_bh,
+        ret = ocfs2_journal_access_di(handle, inode, di_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                                      OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret < 0) {
                mlog_errno(ret);
                goto out;
@@ -1170,21 +1278,17 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
        u16 found;
        struct buffer_head *group_bh = NULL;
        struct ocfs2_group_desc *gd;
+        struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
        struct inode *alloc_inode = ac->ac_inode;
-        ret = ocfs2_read_block(alloc_inode, gd_blkno, &group_bh);
+        ret = ocfs2_read_group_descriptor(alloc_inode, di, gd_blkno,
+                                          &group_bh);
        if (ret < 0) {
                mlog_errno(ret);
                return ret;
        }
        gd = (struct ocfs2_group_desc *) group_bh->b_data;
-        if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
-                OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, gd);
-                ret = -EIO;
-                goto out;
-        }
        ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
                                  ac->ac_max_block, bit_off, &found);
        if (ret < 0) {
@@ -1241,19 +1345,14 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
             bits_wanted, chain,
             (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno);
-        status = ocfs2_read_block(alloc_inode,
+        status = ocfs2_read_group_descriptor(alloc_inode, fe,
-                                  le64_to_cpu(cl->cl_recs[chain].c_blkno),
+                                             le64_to_cpu(cl->cl_recs[chain].c_blkno),
-                                  &group_bh);
+                                             &group_bh);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }
        bg = (struct ocfs2_group_desc *) group_bh->b_data;
-        status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg);
-        if (status) {
-                mlog_errno(status);
-                goto bail;
-        }
        status = -ENOSPC;
        /* for now, the chain search is a bit simplistic. We just use
@@ -1271,18 +1370,13 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
                next_group = le64_to_cpu(bg->bg_next_group);
                prev_group_bh = group_bh;
                group_bh = NULL;
-                status = ocfs2_read_block(alloc_inode,
+                status = ocfs2_read_group_descriptor(alloc_inode, fe,
-                                          next_group, &group_bh);
+                                                     next_group, &group_bh);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail;
                }
                bg = (struct ocfs2_group_desc *) group_bh->b_data;
-                status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg);
-                if (status) {
-                        mlog_errno(status);
-                        goto bail;
-                }
        }
        if (status < 0) {
                if (status != -ENOSPC)
@@ -1324,10 +1418,10 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
        /* Ok, claim our bits now: set the info on dinode, chainlist
         * and then the group */
-        status = ocfs2_journal_access(handle,
+        status = ocfs2_journal_access_di(handle,
-                                      alloc_inode,
+                                         alloc_inode,
-                                      ac->ac_bh,
+                                         ac->ac_bh,
-                                      OCFS2_JOURNAL_ACCESS_WRITE);
+                                         OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
@@ -1392,11 +1486,11 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
        BUG_ON(!ac->ac_bh);
        fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
-        if (!OCFS2_IS_VALID_DINODE(fe)) {
-                OCFS2_RO_ON_INVALID_DINODE(osb->sb, fe);
+        /* The bh was validated by the inode read during
-                status = -EIO;
+         * ocfs2_reserve_suballoc_bits().  Any corruption is a code bug. */
-                goto bail;
+        BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
-        }
        if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
            le32_to_cpu(fe->id1.bitmap1.i_total)) {
                ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used "
@@ -1725,19 +1819,17 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle,
        mlog_entry_void();
-        if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
+        /* The caller got this descriptor from
-                OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
+         * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
-                status = -EIO;
+        BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
-                goto bail;
-        }
        mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
        if (ocfs2_is_cluster_bitmap(alloc_inode))
                journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
-        status = ocfs2_journal_access(handle, alloc_inode, group_bh,
+        status = ocfs2_journal_access_gd(handle, alloc_inode, group_bh,
-                                      journal_type);
+                                         journal_type);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
@@ -1782,29 +1874,26 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
        mlog_entry_void();
-        if (!OCFS2_IS_VALID_DINODE(fe)) {
+        /* The alloc_bh comes from ocfs2_free_dinode() or
-                OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
+         * ocfs2_free_clusters().  The callers have all locked the
-                status = -EIO;
+         * allocator and gotten alloc_bh from the lock call.  This
-                goto bail;
+         * validates the dinode buffer.  Any corruption that has happened
-        }
+         * is a code bug. */
+        BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
        BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
        mlog(0, "%llu: freeing %u bits from group %llu, starting at %u\n",
             (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count,
             (unsigned long long)bg_blkno, start_bit);
-        status = ocfs2_read_block(alloc_inode, bg_blkno, &group_bh);
+        status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno,
+                                             &group_bh);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }
        group = (struct ocfs2_group_desc *) group_bh->b_data;
-        status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, group);
-        if (status) {
-                mlog_errno(status);
-                goto bail;
-        }
        BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
        status = ocfs2_block_group_clear_bits(handle, alloc_inode,
@@ -1815,8 +1904,8 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
                goto bail;
        }
-        status = ocfs2_journal_access(handle, alloc_inode, alloc_bh,
+        status = ocfs2_journal_access_di(handle, alloc_inode, alloc_bh,
-                                      OCFS2_JOURNAL_ACCESS_WRITE);
+                                         OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 4df159d8f450..e3c13c77f9e8 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -164,10 +164,24 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac);
 * and return that block offset. */
 u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster);
-/* somewhat more expensive than our other checks, so use sparingly. */
+/*
+ * By default, ocfs2_read_group_descriptor() calls ocfs2_error() when it
+ * finds a problem.  A caller that wants to check a group descriptor
+ * without going readonly should read the block with ocfs2_read_block[s]()
+ * and then check it with this function.  This is only resize, really.
+ * Everyone else should be using ocfs2_read_group_descriptor().
+ */
 int ocfs2_check_group_descriptor(struct super_block *sb,
                                 struct ocfs2_dinode *di,
-                                 struct ocfs2_group_desc *gd);
+                                 struct buffer_head *bh);
+/*
+ * Read a group descriptor block into *bh.  If *bh is NULL, a bh will be
+ * allocated.  This is a cached read.  The descriptor will be validated with
+ * ocfs2_validate_group_descriptor().
+ */
+int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
+                                u64 gd_blkno, struct buffer_head **bh);
 int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et,
                          u32 clusters_to_add, u32 extents_to_split,
                          struct ocfs2_alloc_context **data_ac,
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 304b63ac78cf..b1cb38fbe807 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -41,6 +41,7 @@
 #include <linux/debugfs.h>
 #include <linux/mount.h>
 #include <linux/seq_file.h>
+#include <linux/quotaops.h>
 #define MLOG_MASK_PREFIX ML_SUPER
 #include <cluster/masklog.h>
@@ -51,6 +52,7 @@
 #include "ocfs1_fs_compat.h"
 #include "alloc.h"
+#include "blockcheck.h"
 #include "dlmglue.h"
 #include "export.h"
 #include "extent_map.h"
@@ -65,10 +67,13 @@
 #include "uptodate.h"
 #include "ver.h"
 #include "xattr.h"
+#include "quota.h"
 #include "buffer_head_io.h"
 static struct kmem_cache *ocfs2_inode_cachep = NULL;
+struct kmem_cache *ocfs2_dquot_cachep;
+struct kmem_cache *ocfs2_qf_chunk_cachep;
 /* OCFS2 needs to schedule several differnt types of work which
 * require cluster locking, disk I/O, recovery waits, etc. Since these
@@ -124,6 +129,9 @@ static int ocfs2_get_sector(struct super_block *sb,
 static void ocfs2_write_super(struct super_block *sb);
 static struct inode *ocfs2_alloc_inode(struct super_block *sb);
 static void ocfs2_destroy_inode(struct inode *inode);
+static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend);
+static int ocfs2_enable_quotas(struct ocfs2_super *osb);
+static void ocfs2_disable_quotas(struct ocfs2_super *osb);
 static const struct super_operations ocfs2_sops = {
        .statfs         = ocfs2_statfs,
@@ -137,6 +145,8 @@ static const struct super_operations ocfs2_sops = {
        .put_super      = ocfs2_put_super,
        .remount_fs     = ocfs2_remount,
        .show_options   = ocfs2_show_options,
+        .quota_read     = ocfs2_quota_read,
+        .quota_write    = ocfs2_quota_write,
 };
 enum {
@@ -158,6 +168,10 @@ enum {
        Opt_user_xattr,
        Opt_nouser_xattr,
        Opt_inode64,
+        Opt_acl,
+        Opt_noacl,
+        Opt_usrquota,
+        Opt_grpquota,
        Opt_err,
 };
@@ -180,6 +194,10 @@ static const match_table_t tokens = {
        {Opt_user_xattr, "user_xattr"},
        {Opt_nouser_xattr, "nouser_xattr"},
        {Opt_inode64, "inode64"},
+        {Opt_acl, "acl"},
+        {Opt_noacl, "noacl"},
+        {Opt_usrquota, "usrquota"},
+        {Opt_grpquota, "grpquota"},
        {Opt_err, NULL}
 };
@@ -221,6 +239,19 @@ static int ocfs2_sync_fs(struct super_block *sb, int wait)
        return 0;
 }
+static int ocfs2_need_system_inode(struct ocfs2_super *osb, int ino)
+{
+        if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA)
+            && (ino == USER_QUOTA_SYSTEM_INODE
+                || ino == LOCAL_USER_QUOTA_SYSTEM_INODE))
+                return 0;
+        if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
+            && (ino == GROUP_QUOTA_SYSTEM_INODE
+                || ino == LOCAL_GROUP_QUOTA_SYSTEM_INODE))
+                return 0;
+        return 1;
+}
 static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
 {
        struct inode *new = NULL;
@@ -247,6 +278,8 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
        for (i = OCFS2_FIRST_ONLINE_SYSTEM_INODE;
             i <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; i++) {
+                if (!ocfs2_need_system_inode(osb, i))
+                        continue;
                new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
                if (!new) {
                        ocfs2_release_system_inodes(osb);
@@ -277,6 +310,8 @@ static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb)
        for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1;
             i < NUM_SYSTEM_INODES;
             i++) {
+                if (!ocfs2_need_system_inode(osb, i))
+                        continue;
                new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
                if (!new) {
                        ocfs2_release_system_inodes(osb);
@@ -426,6 +461,12 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
        /* We're going to/from readonly mode. */
        if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
+                /* Disable quota accounting before remounting RO */
+                if (*flags & MS_RDONLY) {
+                        ret = ocfs2_susp_quotas(osb, 0);
+                        if (ret < 0)
+                                goto out;
+                }
                /* Lock here so the check of HARD_RO and the potential
                 * setting of SOFT_RO is atomic. */
                spin_lock(&osb->osb_lock);
@@ -461,11 +502,28 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
                }
 unlock_osb:
                spin_unlock(&osb->osb_lock);
+                /* Enable quota accounting after remounting RW */
+                if (!ret && !(*flags & MS_RDONLY)) {
+                        if (sb_any_quota_suspended(sb))
+                                ret = ocfs2_susp_quotas(osb, 1);
+                        else
+                                ret = ocfs2_enable_quotas(osb);
+                        if (ret < 0) {
+                                /* Return back changes... */
+                                spin_lock(&osb->osb_lock);
+                                sb->s_flags |= MS_RDONLY;
+                                osb->osb_flags |= OCFS2_OSB_SOFT_RO;
+                                spin_unlock(&osb->osb_lock);
+                                goto out;
+                        }
+                }
        }
        if (!ret) {
                /* Only save off the new mount options in case of a successful
                 * remount. */
+                if (!(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR))
+                        parsed_options.mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
                osb->s_mount_opt = parsed_options.mount_opt;
                osb->s_atime_quantum = parsed_options.atime_quantum;
                osb->preferred_slot = parsed_options.slot;
@@ -619,6 +677,131 @@ static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb,
        return 0;
 }
+static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend)
+{
+        int type;
+        struct super_block *sb = osb->sb;
+        unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+                                             OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
+        int status = 0;
+        for (type = 0; type < MAXQUOTAS; type++) {
+                if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
+                        continue;
+                if (unsuspend)
+                        status = vfs_quota_enable(
+                                        sb_dqopt(sb)->files[type],
+                                        type, QFMT_OCFS2,
+                                        DQUOT_SUSPENDED);
+                else
+                        status = vfs_quota_disable(sb, type,
+                                                   DQUOT_SUSPENDED);
+                if (status < 0)
+                        break;
+        }
+        if (status < 0)
+                mlog(ML_ERROR, "Failed to suspend/unsuspend quotas on "
+                     "remount (error = %d).\n", status);
+        return status;
+}
+static int ocfs2_enable_quotas(struct ocfs2_super *osb)
+{
+        struct inode *inode[MAXQUOTAS] = { NULL, NULL };
+        struct super_block *sb = osb->sb;
+        unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+                                             OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
+        unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
+                                        LOCAL_GROUP_QUOTA_SYSTEM_INODE };
+        int status;
+        int type;
+        sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NEGATIVE_USAGE;
+        for (type = 0; type < MAXQUOTAS; type++) {
+                if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
+                        continue;
+                inode[type] = ocfs2_get_system_file_inode(osb, ino[type],
+                                                        osb->slot_num);
+                if (!inode[type]) {
+                        status = -ENOENT;
+                        goto out_quota_off;
+                }
+                status = vfs_quota_enable(inode[type], type, QFMT_OCFS2,
+                                                DQUOT_USAGE_ENABLED);
+                if (status < 0)
+                        goto out_quota_off;
+        }
+        for (type = 0; type < MAXQUOTAS; type++)
+                iput(inode[type]);
+        return 0;
+out_quota_off:
+        ocfs2_disable_quotas(osb);
+        for (type = 0; type < MAXQUOTAS; type++)
+                iput(inode[type]);
+        mlog_errno(status);
+        return status;
+}
+static void ocfs2_disable_quotas(struct ocfs2_super *osb)
+{
+        int type;
+        struct inode *inode;
+        struct super_block *sb = osb->sb;
+        /* We mostly ignore errors in this function because there's not much
+         * we can do when we see them */
+        for (type = 0; type < MAXQUOTAS; type++) {
+                if (!sb_has_quota_loaded(sb, type))
+                        continue;
+                inode = igrab(sb->s_dquot.files[type]);
+                /* Turn off quotas. This will remove all dquot structures from
+                 * memory and so they will be automatically synced to global
+                 * quota files */
+                vfs_quota_disable(sb, type, DQUOT_USAGE_ENABLED |
+                                            DQUOT_LIMITS_ENABLED);
+                if (!inode)
+                        continue;
+                iput(inode);
+        }
+}
+/* Handle quota on quotactl */
+static int ocfs2_quota_on(struct super_block *sb, int type, int format_id,
+                          char *path, int remount)
+{
+        unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+                                             OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
+        if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
+                return -EINVAL;
+        if (remount)
+                return 0;       /* Just ignore it; it has been handled in
+                                 * ocfs2_remount() */
+        return vfs_quota_enable(sb_dqopt(sb)->files[type], type,
+                                    format_id, DQUOT_LIMITS_ENABLED);
+}
+/* Handle quota off quotactl */
+static int ocfs2_quota_off(struct super_block *sb, int type, int remount)
+{
+        if (remount)
+                return 0;       /* Ignore now and handle later in
+                                 * ocfs2_remount() */
+        return vfs_quota_disable(sb, type, DQUOT_LIMITS_ENABLED);
+}
+static struct quotactl_ops ocfs2_quotactl_ops = {
+        .quota_on       = ocfs2_quota_on,
+        .quota_off      = ocfs2_quota_off,
+        .quota_sync     = vfs_quota_sync,
+        .get_info       = vfs_get_dqinfo,
+        .set_info       = vfs_set_dqinfo,
+        .get_dqblk      = vfs_get_dqblk,
+        .set_dqblk      = vfs_set_dqblk,
+};
 static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 {
        struct dentry *root;
@@ -651,12 +834,32 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
        }
        brelse(bh);
        bh = NULL;
+        if (!(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR))
+                parsed_options.mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
        osb->s_mount_opt = parsed_options.mount_opt;
        osb->s_atime_quantum = parsed_options.atime_quantum;
        osb->preferred_slot = parsed_options.slot;
        osb->osb_commit_interval = parsed_options.commit_interval;
        osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt);
        osb->local_alloc_bits = osb->local_alloc_default_bits;
+        if (osb->s_mount_opt & OCFS2_MOUNT_USRQUOTA &&
+            !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+                                         OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
+                status = -EINVAL;
+                mlog(ML_ERROR, "User quotas were requested, but this "
+                     "filesystem does not have the feature enabled.\n");
+                goto read_super_error;
+        }
+        if (osb->s_mount_opt & OCFS2_MOUNT_GRPQUOTA &&
+            !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+                                         OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
+                status = -EINVAL;
+                mlog(ML_ERROR, "Group quotas were requested, but this "
+                     "filesystem does not have the feature enabled.\n");
+                goto read_super_error;
+        }
        status = ocfs2_verify_userspace_stack(osb, &parsed_options);
        if (status)
@@ -664,6 +867,9 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
        sb->s_magic = OCFS2_SUPER_MAGIC;
+        sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+                ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
        /* Hard readonly mode only if: bdev_read_only, MS_RDONLY,
         * heartbeat=none */
        if (bdev_read_only(sb->s_bdev)) {
@@ -758,6 +964,28 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
        atomic_set(&osb->vol_state, VOLUME_MOUNTED);
        wake_up(&osb->osb_mount_event);
+        /* Now we can initialize quotas because we can afford to wait
+         * for cluster locks recovery now. That also means that truncation
+         * log recovery can happen but that waits for proper quota setup */
+        if (!(sb->s_flags & MS_RDONLY)) {
+                status = ocfs2_enable_quotas(osb);
+                if (status < 0) {
+                        /* We have to err-out specially here because
+                         * s_root is already set */
+                        mlog_errno(status);
+                        atomic_set(&osb->vol_state, VOLUME_DISABLED);
+                        wake_up(&osb->osb_mount_event);
+                        mlog_exit(status);
+                        return status;
+                }
+        }
+        ocfs2_complete_quota_recovery(osb);
+        /* Now we wake up again for processes waiting for quotas */
+        atomic_set(&osb->vol_state, VOLUME_MOUNTED_QUOTAS);
+        wake_up(&osb->osb_mount_event);
        mlog_exit(status);
        return status;
@@ -945,6 +1173,41 @@ static int ocfs2_parse_options(struct super_block *sb,
                case Opt_inode64:
                        mopt->mount_opt |= OCFS2_MOUNT_INODE64;
                        break;
+                case Opt_usrquota:
+                        /* We check only on remount, otherwise features
+                         * aren't yet initialized. */
+                        if (is_remount && !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+                            OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
+                                mlog(ML_ERROR, "User quota requested but "
+                                     "filesystem feature is not set\n");
+                                status = 0;
+                                goto bail;
+                        }
+                        mopt->mount_opt |= OCFS2_MOUNT_USRQUOTA;
+                        break;
+                case Opt_grpquota:
+                        if (is_remount && !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+                            OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
+                                mlog(ML_ERROR, "Group quota requested but "
+                                     "filesystem feature is not set\n");
+                                status = 0;
+                                goto bail;
+                        }
+                        mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA;
+                        break;
+#ifdef CONFIG_OCFS2_FS_POSIX_ACL
+                case Opt_acl:
+                        mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL;
+                        break;
+                case Opt_noacl:
+                        mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
+                        break;
+#else
+                case Opt_acl:
+                case Opt_noacl:
+                        printk(KERN_INFO "ocfs2 (no)acl options not supported\n");
+                        break;
+#endif
                default:
                        mlog(ML_ERROR,
                             "Unrecognized mount option \"%s\" "
@@ -1008,6 +1271,10 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
        if (osb->osb_cluster_stack[0])
                seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN,
                           osb->osb_cluster_stack);
+        if (opts & OCFS2_MOUNT_USRQUOTA)
+                seq_printf(s, ",usrquota");
+        if (opts & OCFS2_MOUNT_GRPQUOTA)
+                seq_printf(s, ",grpquota");
        if (opts & OCFS2_MOUNT_NOUSERXATTR)
                seq_printf(s, ",nouser_xattr");
@@ -1017,6 +1284,13 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
        if (opts & OCFS2_MOUNT_INODE64)
                seq_printf(s, ",inode64");
+#ifdef CONFIG_OCFS2_FS_POSIX_ACL
+        if (opts & OCFS2_MOUNT_POSIX_ACL)
+                seq_printf(s, ",acl");
+        else
+                seq_printf(s, ",noacl");
+#endif
        return 0;
 }
@@ -1052,10 +1326,16 @@ static int __init ocfs2_init(void)
                mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
        }
+        status = ocfs2_quota_setup();
+        if (status)
+                goto leave;
        ocfs2_set_locking_protocol();
+        status = register_quota_format(&ocfs2_quota_format);
 leave:
        if (status < 0) {
+                ocfs2_quota_shutdown();
                ocfs2_free_mem_caches();
                exit_ocfs2_uptodate_cache();
        }
@@ -1072,11 +1352,15 @@ static void __exit ocfs2_exit(void)
 {
        mlog_entry_void();
+        ocfs2_quota_shutdown();
        if (ocfs2_wq) {
                flush_workqueue(ocfs2_wq);
                destroy_workqueue(ocfs2_wq);
        }
+        unregister_quota_format(&ocfs2_quota_format);
        debugfs_remove(ocfs2_debugfs_root);
        ocfs2_free_mem_caches();
@@ -1192,8 +1476,27 @@ static int ocfs2_initialize_mem_caches(void)
                                       (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
                                                SLAB_MEM_SPREAD),
                                       ocfs2_inode_init_once);
-        if (!ocfs2_inode_cachep)
+        ocfs2_dquot_cachep = kmem_cache_create("ocfs2_dquot_cache",
+                                        sizeof(struct ocfs2_dquot),
+                                        0,
+                                        (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
+                                                SLAB_MEM_SPREAD),
+                                        NULL);
+        ocfs2_qf_chunk_cachep = kmem_cache_create("ocfs2_qf_chunk_cache",
+                                        sizeof(struct ocfs2_quota_chunk),
+                                        0,
+                                        (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
+                                        NULL);
+        if (!ocfs2_inode_cachep || !ocfs2_dquot_cachep ||
+            !ocfs2_qf_chunk_cachep) {
+                if (ocfs2_inode_cachep)
+                        kmem_cache_destroy(ocfs2_inode_cachep);
+                if (ocfs2_dquot_cachep)
+                        kmem_cache_destroy(ocfs2_dquot_cachep);
+                if (ocfs2_qf_chunk_cachep)
+                        kmem_cache_destroy(ocfs2_qf_chunk_cachep);
                return -ENOMEM;
+        }
        return 0;
 }
@@ -1202,8 +1505,15 @@ static void ocfs2_free_mem_caches(void)
 {
        if (ocfs2_inode_cachep)
                kmem_cache_destroy(ocfs2_inode_cachep);
        ocfs2_inode_cachep = NULL;
+        if (ocfs2_dquot_cachep)
+                kmem_cache_destroy(ocfs2_dquot_cachep);
+        ocfs2_dquot_cachep = NULL;
+        if (ocfs2_qf_chunk_cachep)
+                kmem_cache_destroy(ocfs2_qf_chunk_cachep);
+        ocfs2_qf_chunk_cachep = NULL;
 }
 static int ocfs2_get_sector(struct super_block *sb,
@@ -1303,6 +1613,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
        osb = OCFS2_SB(sb);
        BUG_ON(!osb);
+        ocfs2_disable_quotas(osb);
        ocfs2_shutdown_local_alloc(osb);
        ocfs2_truncate_log_shutdown(osb);
@@ -1413,6 +1725,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
        sb->s_fs_info = osb;
        sb->s_op = &ocfs2_sops;
        sb->s_export_op = &ocfs2_export_ops;
+        sb->s_qcop = &ocfs2_quotactl_ops;
+        sb->dq_op = &ocfs2_quota_operations;
        sb->s_xattr = ocfs2_xattr_handlers;
        sb->s_time_gran = 1;
        sb->s_flags |= MS_NOATIME;
@@ -1573,6 +1887,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
        INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery);
        journal->j_state = OCFS2_JOURNAL_FREE;
+        INIT_WORK(&osb->dentry_lock_work, ocfs2_drop_dl_inodes);
+        osb->dentry_lock_list = NULL;
        /* get some pseudo constants for clustersize bits */
        osb->s_clustersize_bits =
                le32_to_cpu(di->id2.i_super.s_clustersize_bits);
@@ -1676,6 +1993,15 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
        if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE,
                   strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) {
+                /* We have to do a raw check of the feature here */
+                if (le32_to_cpu(di->id2.i_super.s_feature_incompat) &
+                    OCFS2_FEATURE_INCOMPAT_META_ECC) {
+                        status = ocfs2_block_check_validate(bh->b_data,
+                                                            bh->b_size,
+                                                            &di->i_check);
+                        if (status)
+                                goto out;
+                }
                status = -EINVAL;
                if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) {
                        mlog(ML_ERROR, "found superblock with incorrect block "
@@ -1717,6 +2043,7 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
                }
        }
+out:
        mlog_exit(status);
        return status;
 }
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index cbd03dfdc7b9..ed0a0cfd68d2 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -84,7 +84,7 @@ static char *ocfs2_fast_symlink_getlink(struct inode *inode,
        mlog_entry_void();
-        status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, bh);
+        status = ocfs2_read_inode_block(inode, bh);
        if (status < 0) {
                mlog_errno(status);
                link = ERR_PTR(status);
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 74d7367ade13..915039fffe6e 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -35,12 +35,14 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/string.h>
+#include <linux/security.h>
 #define MLOG_MASK_PREFIX ML_XATTR
 #include <cluster/masklog.h>
 #include "ocfs2.h"
 #include "alloc.h"
+#include "blockcheck.h"
 #include "dlmglue.h"
 #include "file.h"
 #include "symlink.h"
@@ -61,12 +63,32 @@ struct ocfs2_xattr_def_value_root {
 };
 struct ocfs2_xattr_bucket {
-        struct buffer_head *bhs[OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET];
+        /* The inode these xattrs are associated with */
-        struct ocfs2_xattr_header *xh;
+        struct inode *bu_inode;
+        /* The actual buffers that make up the bucket */
+        struct buffer_head *bu_bhs[OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET];
+        /* How many blocks make up one bucket for this filesystem */
+        int bu_blocks;
+};
+struct ocfs2_xattr_set_ctxt {
+        handle_t *handle;
+        struct ocfs2_alloc_context *meta_ac;
+        struct ocfs2_alloc_context *data_ac;
+        struct ocfs2_cached_dealloc_ctxt dealloc;
 };
 #define OCFS2_XATTR_ROOT_SIZE   (sizeof(struct ocfs2_xattr_def_value_root))
 #define OCFS2_XATTR_INLINE_SIZE 80
+#define OCFS2_XATTR_FREE_IN_IBODY       (OCFS2_MIN_XATTR_INLINE_SIZE \
+                                         - sizeof(struct ocfs2_xattr_header) \
+                                         - sizeof(__u32))
+#define OCFS2_XATTR_FREE_IN_BLOCK(ptr)  ((ptr)->i_sb->s_blocksize \
+                                         - sizeof(struct ocfs2_xattr_block) \
+                                         - sizeof(struct ocfs2_xattr_header) \
+                                         - sizeof(__u32))
 static struct ocfs2_xattr_def_value_root def_xv = {
        .xv.xr_list.l_count = cpu_to_le16(1),
@@ -74,13 +96,25 @@ static struct ocfs2_xattr_def_value_root def_xv = {
 struct xattr_handler *ocfs2_xattr_handlers[] = {
        &ocfs2_xattr_user_handler,
+#ifdef CONFIG_OCFS2_FS_POSIX_ACL
+        &ocfs2_xattr_acl_access_handler,
+        &ocfs2_xattr_acl_default_handler,
+#endif
        &ocfs2_xattr_trusted_handler,
+        &ocfs2_xattr_security_handler,
        NULL
 };
 static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
        [OCFS2_XATTR_INDEX_USER]        = &ocfs2_xattr_user_handler,
+#ifdef CONFIG_OCFS2_FS_POSIX_ACL
+        [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS]
+                                        = &ocfs2_xattr_acl_access_handler,
+        [OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT]
+                                        = &ocfs2_xattr_acl_default_handler,
+#endif
        [OCFS2_XATTR_INDEX_TRUSTED]     = &ocfs2_xattr_trusted_handler,
+        [OCFS2_XATTR_INDEX_SECURITY]    = &ocfs2_xattr_security_handler,
 };
 struct ocfs2_xattr_info {
@@ -98,7 +132,7 @@ struct ocfs2_xattr_search {
         */
        struct buffer_head *xattr_bh;
        struct ocfs2_xattr_header *header;
-        struct ocfs2_xattr_bucket bucket;
+        struct ocfs2_xattr_bucket *bucket;
        void *base;
        void *end;
        struct ocfs2_xattr_entry *here;
@@ -127,14 +161,20 @@ static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
                                        size_t buffer_size);
 static int ocfs2_xattr_create_index_block(struct inode *inode,
-                                          struct ocfs2_xattr_search *xs);
+                                          struct ocfs2_xattr_search *xs,
+                                          struct ocfs2_xattr_set_ctxt *ctxt);
 static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
                                             struct ocfs2_xattr_info *xi,
-                                             struct ocfs2_xattr_search *xs);
+                                             struct ocfs2_xattr_search *xs,
+                                             struct ocfs2_xattr_set_ctxt *ctxt);
 static int ocfs2_delete_xattr_index_block(struct inode *inode,
                                          struct buffer_head *xb_bh);
+static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle,
+                                  u64 src_blk, u64 last_blk, u64 to_blk,
+                                  unsigned int start_bucket,
+                                  u32 *first_hash);
 static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb)
 {
@@ -154,6 +194,216 @@ static inline u16 ocfs2_xattr_max_xe_in_bucket(struct super_block *sb)
        return len / sizeof(struct ocfs2_xattr_entry);
 }
+/*
+ * Accessors over a bucket's buffer_head array (bu_bhs):
+ *   bucket_blkno() - block number of the bucket's first buffer_head
+ *   bucket_block() - data pointer of the _n'th buffer_head
+ *   bucket_xh()    - block 0's data viewed as a struct ocfs2_xattr_header
+ * None of them check that the buffer_heads are non-NULL.
+ */
+#define bucket_blkno(_b) ((_b)->bu_bhs[0]->b_blocknr)
+#define bucket_block(_b, _n) ((_b)->bu_bhs[(_n)]->b_data)
+#define bucket_xh(_b) ((struct ocfs2_xattr_header *)bucket_block((_b), 0))
+/*
+ * Allocate a zero-filled bucket descriptor for @inode and record the inode
+ * and the number of blocks per bucket in it.  No buffer_heads are attached.
+ * Returns the new bucket, or NULL when kzalloc() fails.
+ */
+static struct ocfs2_xattr_bucket *ocfs2_xattr_bucket_new(struct inode *inode)
+{
+        int nr_blocks = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+        struct ocfs2_xattr_bucket *new_bucket;
+        /* bu_bhs[] cannot hold more than the compile-time maximum. */
+        BUG_ON(nr_blocks > OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET);
+        new_bucket = kzalloc(sizeof(*new_bucket), GFP_NOFS);
+        if (!new_bucket)
+                return NULL;
+        new_bucket->bu_inode = inode;
+        new_bucket->bu_blocks = nr_blocks;
+        return new_bucket;
+}
+/*
+ * Call brelse() on each of the bucket's bu_blocks buffer_heads and reset
+ * the corresponding slot to NULL.  The bucket itself is not freed.
+ */
+static void ocfs2_xattr_bucket_relse(struct ocfs2_xattr_bucket *bucket)
+{
+        int idx = 0;
+        while (idx < bucket->bu_blocks) {
+                brelse(bucket->bu_bhs[idx]);
+                bucket->bu_bhs[idx] = NULL;
+                idx++;
+        }
+}
+/*
+ * Release a bucket's buffer_heads and free the bucket.  A NULL bucket is
+ * accepted and ignored.
+ */
+static void ocfs2_xattr_bucket_free(struct ocfs2_xattr_bucket *bucket)
+{
+        if (!bucket)
+                return;
+        ocfs2_xattr_bucket_relse(bucket);
+        bucket->bu_inode = NULL;
+        kfree(bucket);
+}
+/*
+ * Attach buffer_heads to a bucket that has never been written to disk.
+ * Nothing is read; each block starting at xb_blkno is obtained with
+ * sb_getblk() and marked as a new up-to-date buffer if it is not already
+ * up to date.  Buckets that already exist on disk must go through
+ * ocfs2_read_xattr_bucket() instead.
+ *
+ * Returns 0 on success.  If a buffer_head cannot be obtained, -EIO is
+ * logged and returned and every buffer_head gathered so far is released.
+ */
+static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
+                                   u64 xb_blkno)
+{
+        int blk, status = 0;
+        for (blk = 0; blk < bucket->bu_blocks; blk++) {
+                struct buffer_head *bh;
+                bh = sb_getblk(bucket->bu_inode->i_sb, xb_blkno + blk);
+                bucket->bu_bhs[blk] = bh;
+                if (!bh) {
+                        status = -EIO;
+                        mlog_errno(status);
+                        ocfs2_xattr_bucket_relse(bucket);
+                        return status;
+                }
+                if (ocfs2_buffer_uptodate(bucket->bu_inode, bh))
+                        continue;
+                ocfs2_set_new_buffer_uptodate(bucket->bu_inode, bh);
+        }
+        return status;
+}
+/*
+ * Read the bucket's bu_blocks blocks starting at xb_blkno into bu_bhs and
+ * validate their metadata ECC against the header's xh_check.
+ *
+ * Returns 0 on success.  On a read or ECC failure the error is returned
+ * and the bucket's buffer_heads are released; only the ECC failure is
+ * logged here.
+ */
+static int ocfs2_read_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
+                                   u64 xb_blkno)
+{
+        int status;
+        status = ocfs2_read_blocks(bucket->bu_inode, xb_blkno,
+                                   bucket->bu_blocks, bucket->bu_bhs, 0,
+                                   NULL);
+        if (status)
+                goto release;
+        status = ocfs2_validate_meta_ecc_bhs(bucket->bu_inode->i_sb,
+                                             bucket->bu_bhs,
+                                             bucket->bu_blocks,
+                                             &bucket_xh(bucket)->xh_check);
+        if (!status)
+                return 0;
+        mlog_errno(status);
+release:
+        ocfs2_xattr_bucket_relse(bucket);
+        return status;
+}
+/*
+ * Call ocfs2_journal_access() with the given access type on every block
+ * of the bucket, in order.  Stops at the first failure, logs it and
+ * returns that error; returns 0 when all blocks succeed.
+ */
+static int ocfs2_xattr_bucket_journal_access(handle_t *handle,
+                                             struct ocfs2_xattr_bucket *bucket,
+                                             int type)
+{
+        int idx;
+        for (idx = 0; idx < bucket->bu_blocks; idx++) {
+                int status = ocfs2_journal_access(handle, bucket->bu_inode,
+                                                  bucket->bu_bhs[idx], type);
+                if (status) {
+                        mlog_errno(status);
+                        return status;
+                }
+        }
+        return 0;
+}
+/*
+ * Recompute the bucket's metadata ECC (passing the header's xh_check
+ * field), then call ocfs2_journal_dirty() on each block.  The return
+ * values of ocfs2_journal_dirty() are not examined.
+ */
+static void ocfs2_xattr_bucket_journal_dirty(handle_t *handle,
+                                             struct ocfs2_xattr_bucket *bucket)
+{
+        int idx = 0;
+        ocfs2_compute_meta_ecc_bhs(bucket->bu_inode->i_sb,
+                                   bucket->bu_bhs, bucket->bu_blocks,
+                                   &bucket_xh(bucket)->xh_check);
+        while (idx < bucket->bu_blocks) {
+                ocfs2_journal_dirty(handle, bucket->bu_bhs[idx]);
+                idx++;
+        }
+}
+/*
+ * Copy the contents of every block of @src into the matching block of
+ * @dest, one filesystem block at a time.  Both buckets must belong to the
+ * same inode and have the same number of blocks; anything else is a BUG.
+ */
+static void ocfs2_xattr_bucket_copy_data(struct ocfs2_xattr_bucket *dest,
+                                         struct ocfs2_xattr_bucket *src)
+{
+        int bytes_per_block = src->bu_inode->i_sb->s_blocksize;
+        int blk = 0;
+        BUG_ON(dest->bu_blocks != src->bu_blocks);
+        BUG_ON(dest->bu_inode != src->bu_inode);
+        while (blk < src->bu_blocks) {
+                memcpy(bucket_block(dest, blk), bucket_block(src, blk),
+                       bytes_per_block);
+                blk++;
+        }
+}
+/*
+ * Sanity-check an xattr block that has just been read into @bh.
+ *
+ * The ECC check comes first; a failure there is returned as-is without
+ * calling ocfs2_error(), since the damage is known to be local to this
+ * block.  The remaining checks (signature, self-referencing block number,
+ * filesystem generation) report through ocfs2_error() and return -EINVAL.
+ * Returns 0 when the block passes every check.
+ */
+static int ocfs2_validate_xattr_block(struct super_block *sb,
+                                      struct buffer_head *bh)
+{
+        struct ocfs2_xattr_block *xb;
+        int status;
+        xb = (struct ocfs2_xattr_block *)bh->b_data;
+        mlog(0, "Validating xattr block %llu\n",
+             (unsigned long long)bh->b_blocknr);
+        BUG_ON(!buffer_uptodate(bh));
+        /* Non-fatal: an ECC error is handed back with the fs left running. */
+        status = ocfs2_validate_meta_ecc(sb, bh->b_data, &xb->xb_check);
+        if (status)
+                return status;
+        /* From here on every failure is reported via ocfs2_error(). */
+        if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
+                ocfs2_error(sb,
+                            "Extended attribute block #%llu has bad "
+                            "signature %.*s",
+                            (unsigned long long)bh->b_blocknr, 7,
+                            xb->xb_signature);
+                return -EINVAL;
+        }
+        if (le64_to_cpu(xb->xb_blkno) != bh->b_blocknr) {
+                ocfs2_error(sb,
+                            "Extended attribute block #%llu has an "
+                            "invalid xb_blkno of %llu",
+                            (unsigned long long)bh->b_blocknr,
+                            (unsigned long long)le64_to_cpu(xb->xb_blkno));
+                return -EINVAL;
+        }
+        if (le32_to_cpu(xb->xb_fs_generation) == OCFS2_SB(sb)->fs_generation)
+                return 0;
+        ocfs2_error(sb,
+                    "Extended attribute block #%llu has an invalid "
+                    "xb_fs_generation of #%u",
+                    (unsigned long long)bh->b_blocknr,
+                    le32_to_cpu(xb->xb_fs_generation));
+        return -EINVAL;
+}
+/*
+ * Read the xattr block at xb_blkno through ocfs2_read_block(), using
+ * ocfs2_validate_xattr_block() as the validation callback.
+ *
+ * *bh may be NULL or an existing buffer_head on entry.  It is only
+ * written on success, and only when it was NULL, in which case it
+ * receives the buffer_head produced by the read.  Returns the status of
+ * ocfs2_read_block().
+ */
+static int ocfs2_read_xattr_block(struct inode *inode, u64 xb_blkno,
+                                  struct buffer_head **bh)
+{
+        struct buffer_head *found = *bh;
+        int status;
+        status = ocfs2_read_block(inode, xb_blkno, &found,
+                                  ocfs2_validate_xattr_block);
+        if (status)
+                return status;
+        /* Only hand a buffer_head up when the caller did not supply one. */
+        if (!*bh)
+                *bh = found;
+        return 0;
+}
 static inline const char *ocfs2_xattr_prefix(int name_index)
 {
        struct xattr_handler *handler = NULL;
@@ -200,54 +450,163 @@ static void ocfs2_xattr_hash_entry(struct inode *inode,
        return;
 }
+/*
+ * Bytes one xattr occupies where its entry is stored: the entry structure,
+ * the rounded-up name, and either the rounded-up value (when value_len is
+ * at most OCFS2_XATTR_INLINE_SIZE) or a value root of
+ * OCFS2_XATTR_ROOT_SIZE bytes (for larger values).
+ */
+static int ocfs2_xattr_entry_real_size(int name_len, size_t value_len)
+{
+        int total = OCFS2_XATTR_SIZE(name_len);
+        if (value_len > OCFS2_XATTR_INLINE_SIZE)
+                total += OCFS2_XATTR_ROOT_SIZE;
+        else
+                total += OCFS2_XATTR_SIZE(value_len);
+        total += sizeof(struct ocfs2_xattr_entry);
+        return total;
+}
+/*
+ * Account for the resources needed to store the security xattr described
+ * by @si on a filesystem whose superblock is dir->i_sb.
+ *
+ * One metadata block is reserved through @xattr_ac (and
+ * OCFS2_XATTR_BLOCK_CREATE_CREDITS added to *xattr_credits) when the block
+ * size is OCFS2_MIN_BLOCKSIZE or the entry does not fit in
+ * OCFS2_XATTR_FREE_IN_IBODY.  If the value is larger than
+ * OCFS2_XATTR_INLINE_SIZE, the clusters it needs are added to
+ * *want_clusters and their block count to *xattr_credits.
+ *
+ * Returns 0 on success or the error from the metadata reservation.
+ */
+int ocfs2_calc_security_init(struct inode *dir,
+                             struct ocfs2_security_xattr_info *si,
+                             int *want_clusters,
+                             int *xattr_credits,
+                             struct ocfs2_alloc_context **xattr_ac)
+{
+        struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+        int entry_size = ocfs2_xattr_entry_real_size(strlen(si->name),
+                                                     si->value_len);
+        int status;
+        int value_clusters;
+        /*
+         * The max space of security xattr taken inline is
+         * 256(name) + 80(value) + 16(entry) = 352 bytes,
+         * so reserving one metadata block for it is enough.
+         */
+        if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE ||
+            entry_size > OCFS2_XATTR_FREE_IN_IBODY) {
+                status = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac);
+                if (status) {
+                        mlog_errno(status);
+                        return status;
+                }
+                *xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
+        }
+        /* Small values are stored inline; nothing more to reserve. */
+        if (si->value_len <= OCFS2_XATTR_INLINE_SIZE)
+                return 0;
+        /* Reserve clusters for an xattr value that will be set in a B-tree. */
+        value_clusters = ocfs2_clusters_for_bytes(dir->i_sb, si->value_len);
+        *xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb, value_clusters);
+        *want_clusters += value_clusters;
+        return 0;
+}
+/*
+ * Account for the resources needed to store the initial xattrs: the
+ * security xattr described by @si (only when si->enable is set) and, when
+ * the filesystem is mounted with OCFS2_MOUNT_POSIX_ACL, ACL xattrs sized
+ * from @dir's default ACL (counted twice when @mode is a directory).
+ *
+ * @dir_bh is passed to ocfs2_xattr_get_nolock() to look up the length of
+ * dir's default ACL.  *want_clusters and *xattr_credits are only ever
+ * increased; a metadata block may be reserved through @xattr_ac.
+ *
+ * Returns 0 on success (including when there is nothing to store), or a
+ * negative errno from the default-ACL lookup or the metadata reservation.
+ */
+int ocfs2_calc_xattr_init(struct inode *dir,
+                          struct buffer_head *dir_bh,
+                          int mode,
+                          struct ocfs2_security_xattr_info *si,
+                          int *want_clusters,
+                          int *xattr_credits,
+                          struct ocfs2_alloc_context **xattr_ac)
+{
+        int ret = 0;
+        struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+        int s_size = 0, a_size = 0, acl_len = 0, new_clusters;
+        if (si->enable)
+                s_size = ocfs2_xattr_entry_real_size(strlen(si->name),
+                                                     si->value_len);
+        if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
+                acl_len = ocfs2_xattr_get_nolock(dir, dir_bh,
+                                        OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT,
+                                        "", NULL, 0);
+                if (acl_len > 0) {
+                        a_size = ocfs2_xattr_entry_real_size(0, acl_len);
+                        if (S_ISDIR(mode))
+                                a_size <<= 1;
+                } else if (acl_len != 0 && acl_len != -ENODATA) {
+                        /*
+                         * The lookup failed for a reason other than "no
+                         * default ACL".  Propagate that error; returning
+                         * the still-zero ret would report success with
+                         * nothing reserved for the ACL.
+                         */
+                        ret = acl_len;
+                        mlog_errno(ret);
+                        return ret;
+                }
+        }
+        /* Neither a security xattr nor an ACL to store: nothing to reserve. */
+        if (!(s_size + a_size))
+                return ret;
+        /*
+         * The max space of security xattr taken inline is
+         * 256(name) + 80(value) + 16(entry) = 352 bytes,
+         * The max space of acl xattr taken inline is
+         * 80(value) + 16(entry) * 2(if directory) = 192 bytes,
+         * when blocksize = 512, may reserve one more cluster for
+         * xattr bucket, otherwise reserve one metadata block
+         * for them is ok.
+         */
+        if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE ||
+            (s_size + a_size) > OCFS2_XATTR_FREE_IN_IBODY) {
+                ret = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac);
+                if (ret) {
+                        mlog_errno(ret);
+                        return ret;
+                }
+                *xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
+        }
+        if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE &&
+            (s_size + a_size) > OCFS2_XATTR_FREE_IN_BLOCK(dir)) {
+                *want_clusters += 1;
+                *xattr_credits += ocfs2_blocks_per_xattr_bucket(dir->i_sb);
+        }
+        /*
+         * reserve credits and clusters for xattrs which has large value
+         * and have to be set outside
+         */
+        if (si->enable && si->value_len > OCFS2_XATTR_INLINE_SIZE) {
+                new_clusters = ocfs2_clusters_for_bytes(dir->i_sb,
+                                                        si->value_len);
+                *xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb,
+                                                           new_clusters);
+                *want_clusters += new_clusters;
+        }
+        if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL &&
+            acl_len > OCFS2_XATTR_INLINE_SIZE) {
+                /* for directory, it has DEFAULT and ACCESS two types of acls */
+                new_clusters = (S_ISDIR(mode) ? 2 : 1) *
+                                ocfs2_clusters_for_bytes(dir->i_sb, acl_len);
+                *xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb,
+                                                           new_clusters);
+                *want_clusters += new_clusters;
+        }
+        return ret;
+}
 static int ocfs2_xattr_extend_allocation(struct inode *inode,
                                         u32 clusters_to_add,
-                                         struct buffer_head *xattr_bh,
+                                         struct ocfs2_xattr_value_buf *vb,
-                                         struct ocfs2_xattr_value_root *xv)
+                                         struct ocfs2_xattr_set_ctxt *ctxt)
 {
        int status = 0;
-        int restart_func = 0;
+        handle_t *handle = ctxt->handle;
-        int credits = 0;
-        handle_t *handle = NULL;
-        struct ocfs2_alloc_context *data_ac = NULL;
-        struct ocfs2_alloc_context *meta_ac = NULL;
        enum ocfs2_alloc_restarted why;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-        u32 prev_clusters, logical_start = le32_to_cpu(xv->xr_clusters);
+        u32 prev_clusters, logical_start = le32_to_cpu(vb->vb_xv->xr_clusters);
        struct ocfs2_extent_tree et;
        mlog(0, "(clusters_to_add for xattr= %u)\n", clusters_to_add);
-        ocfs2_init_xattr_value_extent_tree(&et, inode, xattr_bh, xv);
+        ocfs2_init_xattr_value_extent_tree(&et, inode, vb);
-restart_all:
-        status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
-                                       &data_ac, &meta_ac);
-        if (status) {
-                mlog_errno(status);
-                goto leave;
-        }
-        credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el,
-                                            clusters_to_add);
-        handle = ocfs2_start_trans(osb, credits);
-        if (IS_ERR(handle)) {
-                status = PTR_ERR(handle);
-                handle = NULL;
-                mlog_errno(status);
-                goto leave;
-        }
-restarted_transaction:
+        status = vb->vb_access(handle, inode, vb->vb_bh,
-        status = ocfs2_journal_access(handle, inode, xattr_bh,
+                              OCFS2_JOURNAL_ACCESS_WRITE);
-                                      OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
        }
-        prev_clusters = le32_to_cpu(xv->xr_clusters);
+        prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters);
        status = ocfs2_add_clusters_in_btree(osb,
                                             inode,
                                             &logical_start,
@@ -255,157 +614,84 @@ restarted_transaction:
                                             0,
                                             &et,
                                             handle,
-                                             data_ac,
+                                             ctxt->data_ac,
-                                             meta_ac,
+                                             ctxt->meta_ac,
                                             &why);
-        if ((status < 0) && (status != -EAGAIN)) {
+        if (status < 0) {
-                if (status != -ENOSPC)
+                mlog_errno(status);
-                        mlog_errno(status);
                goto leave;
        }
-        status = ocfs2_journal_dirty(handle, xattr_bh);
+        status = ocfs2_journal_dirty(handle, vb->vb_bh);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
        }
-        clusters_to_add -= le32_to_cpu(xv->xr_clusters) - prev_clusters;
+        clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) - prev_clusters;
-        if (why != RESTART_NONE && clusters_to_add) {
+        /*
-                if (why == RESTART_META) {
+         * We should have already allocated enough space before the transaction,
-                        mlog(0, "restarting function.\n");
+         * so no need to restart.
-                        restart_func = 1;
+         */
-                } else {
+        BUG_ON(why != RESTART_NONE || clusters_to_add);
-                        BUG_ON(why != RESTART_TRANS);
-                        mlog(0, "restarting transaction.\n");
-                        /* TODO: This can be more intelligent. */
-                        credits = ocfs2_calc_extend_credits(osb->sb,
-                                                            et.et_root_el,
-                                                            clusters_to_add);
-                        status = ocfs2_extend_trans(handle, credits);
-                        if (status < 0) {
-                                /* handle still has to be committed at
-                                 * this point. */
-                                status = -ENOMEM;
-                                mlog_errno(status);
-                                goto leave;
-                        }
-                        goto restarted_transaction;
-                }
-        }
 leave:
-        if (handle) {
-                ocfs2_commit_trans(osb, handle);
-                handle = NULL;
-        }
-        if (data_ac) {
-                ocfs2_free_alloc_context(data_ac);
-                data_ac = NULL;
-        }
-        if (meta_ac) {
-                ocfs2_free_alloc_context(meta_ac);
-                meta_ac = NULL;
-        }
-        if ((!status) && restart_func) {
-                restart_func = 0;
-                goto restart_all;
-        }
        return status;
 }
 static int __ocfs2_remove_xattr_range(struct inode *inode,
-                                      struct buffer_head *root_bh,
+                                      struct ocfs2_xattr_value_buf *vb,
-                                      struct ocfs2_xattr_value_root *xv,
                                      u32 cpos, u32 phys_cpos, u32 len,
-                                      struct ocfs2_cached_dealloc_ctxt *dealloc)
+                                      struct ocfs2_xattr_set_ctxt *ctxt)
 {
        int ret;
        u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
-        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        handle_t *handle = ctxt->handle;
-        struct inode *tl_inode = osb->osb_tl_inode;
-        handle_t *handle;
-        struct ocfs2_alloc_context *meta_ac = NULL;
        struct ocfs2_extent_tree et;
-        ocfs2_init_xattr_value_extent_tree(&et, inode, root_bh, xv);
+        ocfs2_init_xattr_value_extent_tree(&et, inode, vb);
-        ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
+        ret = vb->vb_access(handle, inode, vb->vb_bh,
+                            OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
-                return ret;
-        }
-        mutex_lock(&tl_inode->i_mutex);
-        if (ocfs2_truncate_log_needs_flush(osb)) {
-                ret = __ocfs2_flush_truncate_log(osb);
-                if (ret < 0) {
-                        mlog_errno(ret);
-                        goto out;
-                }
-        }
-        handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
-        if (IS_ERR(handle)) {
-                ret = PTR_ERR(handle);
-                mlog_errno(ret);
                goto out;
        }
-        ret = ocfs2_journal_access(handle, inode, root_bh,
+        ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, ctxt->meta_ac,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                                  &ctxt->dealloc);
-        if (ret) {
-                mlog_errno(ret);
-                goto out_commit;
-        }
-        ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac,
-                                  dealloc);
        if (ret) {
                mlog_errno(ret);
-                goto out_commit;
+                goto out;
        }
-        le32_add_cpu(&xv->xr_clusters, -len);
+        le32_add_cpu(&vb->vb_xv->xr_clusters, -len);
-        ret = ocfs2_journal_dirty(handle, root_bh);
+        ret = ocfs2_journal_dirty(handle, vb->vb_bh);
        if (ret) {
                mlog_errno(ret);
-                goto out_commit;
+                goto out;
        }
-        ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
+        ret = ocfs2_cache_cluster_dealloc(&ctxt->dealloc, phys_blkno, len);
        if (ret)
                mlog_errno(ret);
-out_commit:
-        ocfs2_commit_trans(osb, handle);
 out:
-        mutex_unlock(&tl_inode->i_mutex);
-        if (meta_ac)
-                ocfs2_free_alloc_context(meta_ac);
        return ret;
 }
 static int ocfs2_xattr_shrink_size(struct inode *inode,
                                   u32 old_clusters,
                                   u32 new_clusters,
-                                   struct buffer_head *root_bh,
+                                   struct ocfs2_xattr_value_buf *vb,
-                                   struct ocfs2_xattr_value_root *xv)
+                                   struct ocfs2_xattr_set_ctxt *ctxt)
 {
        int ret = 0;
        u32 trunc_len, cpos, phys_cpos, alloc_size;
        u64 block;
-        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-        struct ocfs2_cached_dealloc_ctxt dealloc;
-        ocfs2_init_dealloc_ctxt(&dealloc);
        if (old_clusters <= new_clusters)
                return 0;
@@ -414,7 +700,8 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
        trunc_len = old_clusters - new_clusters;
        while (trunc_len) {
                ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos,
-                                               &alloc_size, &xv->xr_list);
+                                               &alloc_size,
+                                               &vb->vb_xv->xr_list);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
@@ -423,9 +710,9 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
                if (alloc_size > trunc_len)
                        alloc_size = trunc_len;
-                ret = __ocfs2_remove_xattr_range(inode, root_bh, xv, cpos,
+                ret = __ocfs2_remove_xattr_range(inode, vb, cpos,
                                                 phys_cpos, alloc_size,
-                                                 &dealloc);
+                                                 ctxt);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
@@ -439,20 +726,17 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
        }
 out:
-        ocfs2_schedule_truncate_log_flush(osb, 1);
-        ocfs2_run_deallocs(osb, &dealloc);
        return ret;
 }
 static int ocfs2_xattr_value_truncate(struct inode *inode,
-                                      struct buffer_head *root_bh,
+                                      struct ocfs2_xattr_value_buf *vb,
-                                      struct ocfs2_xattr_value_root *xv,
+                                      int len,
-                                      int len)
+                                      struct ocfs2_xattr_set_ctxt *ctxt)
 {
        int ret;
        u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, len);
-        u32 old_clusters = le32_to_cpu(xv->xr_clusters);
+        u32 old_clusters = le32_to_cpu(vb->vb_xv->xr_clusters);
        if (new_clusters == old_clusters)
                return 0;
@@ -460,11 +744,11 @@ static int ocfs2_xattr_value_truncate(struct inode *inode,
        if (new_clusters > old_clusters)
                ret = ocfs2_xattr_extend_allocation(inode,
                                                    new_clusters - old_clusters,
-                                                    root_bh, xv);
+                                                    vb, ctxt);
        else
                ret = ocfs2_xattr_shrink_size(inode,
                                              old_clusters, new_clusters,
-                                              root_bh, xv);
+                                              vb, ctxt);
        return ret;
 }
@@ -554,18 +838,14 @@ static int ocfs2_xattr_block_list(struct inode *inode,
        if (!di->i_xattr_loc)
                return ret;
-        ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh);
+        ret = ocfs2_read_xattr_block(inode, le64_to_cpu(di->i_xattr_loc),
+                                     &blk_bh);
        if (ret < 0) {
                mlog_errno(ret);
                return ret;
        }
        xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
-        if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
-                ret = -EIO;
-                goto cleanup;
-        }
        if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
                struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header;
                ret = ocfs2_xattr_list_entries(inode, header,
@@ -575,7 +855,7 @@ static int ocfs2_xattr_block_list(struct inode *inode,
                ret = ocfs2_xattr_tree_list_index_block(inode, xt,
                                                   buffer, buffer_size);
        }
-cleanup:
        brelse(blk_bh);
        return ret;
@@ -685,7 +965,7 @@ static int ocfs2_xattr_get_value_outside(struct inode *inode,
                blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
                /* Copy ocfs2_xattr_value */
                for (i = 0; i < num_clusters * bpc; i++, blkno++) {
-                        ret = ocfs2_read_block(inode, blkno, &bh);
+                        ret = ocfs2_read_block(inode, blkno, &bh, NULL);
                        if (ret) {
                                mlog_errno(ret);
                                goto out;
@@ -769,7 +1049,12 @@ static int ocfs2_xattr_block_get(struct inode *inode,
        size_t size;
        int ret = -ENODATA, name_offset, name_len, block_off, i;
-        memset(&xs->bucket, 0, sizeof(xs->bucket));
+        xs->bucket = ocfs2_xattr_bucket_new(inode);
+        if (!xs->bucket) {
+                ret = -ENOMEM;
+                mlog_errno(ret);
+                goto cleanup;
+        }
        ret = ocfs2_xattr_block_find(inode, name_index, name, xs);
        if (ret) {
@@ -795,11 +1080,11 @@ static int ocfs2_xattr_block_get(struct inode *inode,
                if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
                        ret = ocfs2_xattr_bucket_get_name_value(inode,
-                                                                xs->bucket.xh,
+                                                                bucket_xh(xs->bucket),
                                                                i,
                                                                &block_off,
                                                                &name_offset);
-                        xs->base = xs->bucket.bhs[block_off]->b_data;
+                        xs->base = bucket_block(xs->bucket, block_off);
                }
                if (ocfs2_xattr_is_local(xs->here)) {
                        memcpy(buffer, (void *)xs->base +
@@ -817,21 +1102,15 @@ static int ocfs2_xattr_block_get(struct inode *inode,
        }
        ret = size;
 cleanup:
-        for (i = 0; i < OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET; i++)
+        ocfs2_xattr_bucket_free(xs->bucket);
-                brelse(xs->bucket.bhs[i]);
-        memset(&xs->bucket, 0, sizeof(xs->bucket));
        brelse(xs->xattr_bh);
        xs->xattr_bh = NULL;
        return ret;
 }
-/* ocfs2_xattr_get()
+int ocfs2_xattr_get_nolock(struct inode *inode,
- *
+                           struct buffer_head *di_bh,
- * Copy an extended attribute into the buffer provided.
- * Buffer is NULL to compute the size of buffer required.
- */
-static int ocfs2_xattr_get(struct inode *inode,
                           int name_index,
                           const char *name,
                           void *buffer,
@@ -839,7 +1118,6 @@ static int ocfs2_xattr_get(struct inode *inode,
 {
        int ret;
        struct ocfs2_dinode *di = NULL;
-        struct buffer_head *di_bh = NULL;
        struct ocfs2_inode_info *oi = OCFS2_I(inode);
        struct ocfs2_xattr_search xis = {
                .not_found = -ENODATA,
@@ -854,11 +1132,6 @@ static int ocfs2_xattr_get(struct inode *inode,
        if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
                ret = -ENODATA;
-        ret = ocfs2_inode_lock(inode, &di_bh, 0);
-        if (ret < 0) {
-                mlog_errno(ret);
-                return ret;
-        }
        xis.inode_bh = xbs.inode_bh = di_bh;
        di = (struct ocfs2_dinode *)di_bh->b_data;
@@ -869,6 +1142,32 @@ static int ocfs2_xattr_get(struct inode *inode,
                ret = ocfs2_xattr_block_get(inode, name_index, name, buffer,
                                            buffer_size, &xbs);
        up_read(&oi->ip_xattr_sem);
+        return ret;
+}
+/* ocfs2_xattr_get()
+ *
+ * Copy an extended attribute into the buffer provided.
+ * Buffer is NULL to compute the size of buffer required.
+ */
+static int ocfs2_xattr_get(struct inode *inode,
+                           int name_index,
+                           const char *name,
+                           void *buffer,
+                           size_t buffer_size)
+{
+        int ret;
+        struct buffer_head *di_bh = NULL;
+        ret = ocfs2_inode_lock(inode, &di_bh, 0);
+        if (ret < 0) {
+                mlog_errno(ret);
+                return ret;
+        }
+        ret = ocfs2_xattr_get_nolock(inode, di_bh, name_index,
+                                     name, buffer, buffer_size);
        ocfs2_inode_unlock(inode, 0);
        brelse(di_bh);
@@ -877,44 +1176,36 @@ static int ocfs2_xattr_get(struct inode *inode,
 }
 static int __ocfs2_xattr_set_value_outside(struct inode *inode,
+                                           handle_t *handle,
                                           struct ocfs2_xattr_value_root *xv,
                                           const void *value,
                                           int value_len)
 {
-        int ret = 0, i, cp_len, credits;
+        int ret = 0, i, cp_len;
        u16 blocksize = inode->i_sb->s_blocksize;
        u32 p_cluster, num_clusters;
        u32 cpos = 0, bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
        u32 clusters = ocfs2_clusters_for_bytes(inode->i_sb, value_len);
        u64 blkno;
        struct buffer_head *bh = NULL;
-        handle_t *handle;
        BUG_ON(clusters > le32_to_cpu(xv->xr_clusters));
-        credits = clusters * bpc;
-        handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb), credits);
-        if (IS_ERR(handle)) {
-                ret = PTR_ERR(handle);
-                mlog_errno(ret);
-                goto out;
-        }
        while (cpos < clusters) {
                ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
                                               &num_clusters, &xv->xr_list);
                if (ret) {
                        mlog_errno(ret);
-                        goto out_commit;
+                        goto out;
                }
                blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
                for (i = 0; i < num_clusters * bpc; i++, blkno++) {
-                        ret = ocfs2_read_block(inode, blkno, &bh);
+                        ret = ocfs2_read_block(inode, blkno, &bh, NULL);
                        if (ret) {
                                mlog_errno(ret);
-                                goto out_commit;
+                                goto out;
                        }
                        ret = ocfs2_journal_access(handle,
@@ -923,7 +1214,7 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
                                                   OCFS2_JOURNAL_ACCESS_WRITE);
                        if (ret < 0) {
                                mlog_errno(ret);
-                                goto out_commit;
+                                goto out;
                        }
                        cp_len = value_len > blocksize ? blocksize : value_len;
@@ -937,7 +1228,7 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
                        ret = ocfs2_journal_dirty(handle, bh);
                        if (ret < 0) {
                                mlog_errno(ret);
-                                goto out_commit;
+                                goto out;
                        }
                        brelse(bh);
                        bh = NULL;
@@ -951,8 +1242,6 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
                }
                cpos += num_clusters;
        }
-out_commit:
-        ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
 out:
        brelse(bh);
@@ -960,28 +1249,22 @@ out:
 }
 static int ocfs2_xattr_cleanup(struct inode *inode,
+                               handle_t *handle,
                               struct ocfs2_xattr_info *xi,
                               struct ocfs2_xattr_search *xs,
+                               struct ocfs2_xattr_value_buf *vb,
                               size_t offs)
 {
-        handle_t *handle = NULL;
        int ret = 0;
        size_t name_len = strlen(xi->name);
        void *val = xs->base + offs;
        size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
-        handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
+        ret = vb->vb_access(handle, inode, vb->vb_bh,
-                                   OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
+                            OCFS2_JOURNAL_ACCESS_WRITE);
-        if (IS_ERR(handle)) {
-                ret = PTR_ERR(handle);
-                mlog_errno(ret);
-                goto out;
-        }
-        ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
-                goto out_commit;
+                goto out;
        }
        /* Decrease xattr count */
        le16_add_cpu(&xs->header->xh_count, -1);
@@ -989,35 +1272,27 @@ static int ocfs2_xattr_cleanup(struct inode *inode,
        memset((void *)xs->here, 0, sizeof(struct ocfs2_xattr_entry));
        memset(val, 0, size);
-        ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
+        ret = ocfs2_journal_dirty(handle, vb->vb_bh);
        if (ret < 0)
                mlog_errno(ret);
-out_commit:
-        ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
 out:
        return ret;
 }
 static int ocfs2_xattr_update_entry(struct inode *inode,
+                                    handle_t *handle,
                                    struct ocfs2_xattr_info *xi,
                                    struct ocfs2_xattr_search *xs,
+                                    struct ocfs2_xattr_value_buf *vb,
                                    size_t offs)
 {
-        handle_t *handle = NULL;
+        int ret;
-        int ret = 0;
-        handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
+        ret = vb->vb_access(handle, inode, vb->vb_bh,
-                                   OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
+                            OCFS2_JOURNAL_ACCESS_WRITE);
-        if (IS_ERR(handle)) {
-                ret = PTR_ERR(handle);
-                mlog_errno(ret);
-                goto out;
-        }
-        ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
-                goto out_commit;
+                goto out;
        }
        xs->here->xe_name_offset = cpu_to_le16(offs);
@@ -1028,11 +1303,9 @@ static int ocfs2_xattr_update_entry(struct inode *inode,
                ocfs2_xattr_set_local(xs->here, 0);
        ocfs2_xattr_hash_entry(inode, xs->header, xs->here);
-        ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
+        ret = ocfs2_journal_dirty(handle, vb->vb_bh);
        if (ret < 0)
                mlog_errno(ret);
-out_commit:
-        ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
 out:
        return ret;
 }
@@ -1045,6 +1318,8 @@ out:
 static int ocfs2_xattr_set_value_outside(struct inode *inode,
                                         struct ocfs2_xattr_info *xi,
                                         struct ocfs2_xattr_search *xs,
+                                         struct ocfs2_xattr_set_ctxt *ctxt,
+                                         struct ocfs2_xattr_value_buf *vb,
                                         size_t offs)
 {
        size_t name_len = strlen(xi->name);
@@ -1062,20 +1337,20 @@ static int ocfs2_xattr_set_value_outside(struct inode *inode,
        xv->xr_list.l_tree_depth = 0;
        xv->xr_list.l_count = cpu_to_le16(1);
        xv->xr_list.l_next_free_rec = 0;
+        vb->vb_xv = xv;
-        ret = ocfs2_xattr_value_truncate(inode, xs->xattr_bh, xv,
+        ret = ocfs2_xattr_value_truncate(inode, vb, xi->value_len, ctxt);
-                                         xi->value_len);
        if (ret < 0) {
                mlog_errno(ret);
                return ret;
        }
-        ret = __ocfs2_xattr_set_value_outside(inode, xv, xi->value,
+        ret = ocfs2_xattr_update_entry(inode, ctxt->handle, xi, xs, vb, offs);
-                                              xi->value_len);
        if (ret < 0) {
                mlog_errno(ret);
                return ret;
        }
-        ret = ocfs2_xattr_update_entry(inode, xi, xs, offs);
+        ret = __ocfs2_xattr_set_value_outside(inode, ctxt->handle, vb->vb_xv,
+                                              xi->value, xi->value_len);
        if (ret < 0)
                mlog_errno(ret);
@@ -1195,6 +1470,7 @@ static void ocfs2_xattr_set_entry_local(struct inode *inode,
 static int ocfs2_xattr_set_entry(struct inode *inode,
                                 struct ocfs2_xattr_info *xi,
                                 struct ocfs2_xattr_search *xs,
+                                 struct ocfs2_xattr_set_ctxt *ctxt,
                                 int flag)
 {
        struct ocfs2_xattr_entry *last;
@@ -1202,7 +1478,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
        struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
        size_t min_offs = xs->end - xs->base, name_len = strlen(xi->name);
        size_t size_l = 0;
-        handle_t *handle = NULL;
+        handle_t *handle = ctxt->handle;
        int free, i, ret;
        struct ocfs2_xattr_info xi_l = {
                .name_index = xi->name_index,
@@ -1210,6 +1486,16 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
                .value = xi->value,
                .value_len = xi->value_len,
        };
+        struct ocfs2_xattr_value_buf vb = {
+                .vb_bh = xs->xattr_bh,
+                .vb_access = ocfs2_journal_access_di,
+        };
+        if (!(flag & OCFS2_INLINE_XATTR_FL)) {
+                BUG_ON(xs->xattr_bh == xs->inode_bh);
+                vb.vb_access = ocfs2_journal_access_xb;
+        } else
+                BUG_ON(xs->xattr_bh != xs->inode_bh);
        /* Compute min_offs, last and free space. */
        last = xs->header->xh_entries;
@@ -1265,15 +1551,14 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
                if (ocfs2_xattr_is_local(xs->here) && size == size_l) {
                        /* Replace existing local xattr with tree root */
                        ret = ocfs2_xattr_set_value_outside(inode, xi, xs,
-                                                            offs);
+                                                            ctxt, &vb, offs);
                        if (ret < 0)
                                mlog_errno(ret);
                        goto out;
                } else if (!ocfs2_xattr_is_local(xs->here)) {
                        /* For existing xattr which has value outside */
-                        struct ocfs2_xattr_value_root *xv = NULL;
+                        vb.vb_xv = (struct ocfs2_xattr_value_root *)
-                        xv = (struct ocfs2_xattr_value_root *)(val +
+                                (val + OCFS2_XATTR_SIZE(name_len));
-                                OCFS2_XATTR_SIZE(name_len));
                        if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
                                /*
@@ -1282,27 +1567,30 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
                                 * then set new value with set_value_outside().
                                 */
                                ret = ocfs2_xattr_value_truncate(inode,
-                                                                 xs->xattr_bh,
+                                                                 &vb,
-                                                                 xv,
+                                                                 xi->value_len,
-                                                                 xi->value_len);
+                                                                 ctxt);
                                if (ret < 0) {
                                        mlog_errno(ret);
                                        goto out;
                                }
-                                ret = __ocfs2_xattr_set_value_outside(inode,
+                                ret = ocfs2_xattr_update_entry(inode,
-                                                                xv,
+                                                               handle,
-                                                                xi->value,
+                                                               xi,
-                                                                xi->value_len);
+                                                               xs,
+                                                               &vb,
+                                                               offs);
                                if (ret < 0) {
                                        mlog_errno(ret);
                                        goto out;
                                }
-                                ret = ocfs2_xattr_update_entry(inode,
+                                ret = __ocfs2_xattr_set_value_outside(inode,
-                                                               xi,
+                                                                handle,
-                                                               xs,
+                                                                vb.vb_xv,
-                                                               offs);
+                                                                xi->value,
+                                                                xi->value_len);
                                if (ret < 0)
                                        mlog_errno(ret);
                                goto out;
@@ -1312,44 +1600,28 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
                                 * just trucate old value to zero.
                                 */
                                 ret = ocfs2_xattr_value_truncate(inode,
-                                                                 xs->xattr_bh,
+                                                                  &vb,
-                                                                 xv,
+                                                                  0,
-                                                                 0);
+                                                                  ctxt);
                                if (ret < 0)
                                        mlog_errno(ret);
                        }
                }
        }
-        handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
+        ret = ocfs2_journal_access_di(handle, inode, xs->inode_bh,
-                                   OCFS2_INODE_UPDATE_CREDITS);
+                                      OCFS2_JOURNAL_ACCESS_WRITE);
-        if (IS_ERR(handle)) {
-                ret = PTR_ERR(handle);
-                mlog_errno(ret);
-                goto out;
-        }
-        ret = ocfs2_journal_access(handle, inode, xs->inode_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
-                goto out_commit;
+                goto out;
        }
        if (!(flag & OCFS2_INLINE_XATTR_FL)) {
-                /* set extended attribute in external block. */
+                ret = vb.vb_access(handle, inode, vb.vb_bh,
-                ret = ocfs2_extend_trans(handle,
+                                   OCFS2_JOURNAL_ACCESS_WRITE);
-                                         OCFS2_INODE_UPDATE_CREDITS +
-                                         OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
-                if (ret) {
-                        mlog_errno(ret);
-                        goto out_commit;
-                }
-                ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
-                                           OCFS2_JOURNAL_ACCESS_WRITE);
                if (ret) {
                        mlog_errno(ret);
-                        goto out_commit;
+                        goto out;
                }
        }
@@ -1363,7 +1635,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
                ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
                if (ret < 0) {
                        mlog_errno(ret);
-                        goto out_commit;
+                        goto out;
                }
        }
@@ -1391,25 +1663,19 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
        oi->ip_dyn_features |= flag;
        di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
        spin_unlock(&oi->ip_lock);
-        /* Update inode ctime */
-        inode->i_ctime = CURRENT_TIME;
-        di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
-        di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
        ret = ocfs2_journal_dirty(handle, xs->inode_bh);
        if (ret < 0)
                mlog_errno(ret);
-out_commit:
-        ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
        if (!ret && xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
                /*
                 * Set value outside in B tree.
                 * This is the second step for value size > INLINE_SIZE.
                 */
                size_t offs = le16_to_cpu(xs->here->xe_name_offset);
-                ret = ocfs2_xattr_set_value_outside(inode, xi, xs, offs);
+                ret = ocfs2_xattr_set_value_outside(inode, xi, xs, ctxt,
+                                                    &vb, offs);
                if (ret < 0) {
                        int ret2;
@@ -1418,41 +1684,56 @@ out_commit:
                         * If set value outside failed, we have to clean
                         * the junk tree root we have already set in local.
                         */
-                        ret2 = ocfs2_xattr_cleanup(inode, xi, xs, offs);
+                        ret2 = ocfs2_xattr_cleanup(inode, ctxt->handle,
+                                                   xi, xs, &vb, offs);
                        if (ret2 < 0)
                                mlog_errno(ret2);
                }
        }
 out:
        return ret;
 }
 static int ocfs2_remove_value_outside(struct inode*inode,
-                                      struct buffer_head *bh,
+                                      struct ocfs2_xattr_value_buf *vb,
                                      struct ocfs2_xattr_header *header)
 {
        int ret = 0, i;
+        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, };
+        ocfs2_init_dealloc_ctxt(&ctxt.dealloc);
+        ctxt.handle = ocfs2_start_trans(osb,
+                                        ocfs2_remove_extent_credits(osb->sb));
+        if (IS_ERR(ctxt.handle)) {
+                ret = PTR_ERR(ctxt.handle);
+                mlog_errno(ret);
+                goto out;
+        }
        for (i = 0; i < le16_to_cpu(header->xh_count); i++) {
                struct ocfs2_xattr_entry *entry = &header->xh_entries[i];
                if (!ocfs2_xattr_is_local(entry)) {
-                        struct ocfs2_xattr_value_root *xv;
                        void *val;
                        val = (void *)header +
                                le16_to_cpu(entry->xe_name_offset);
-                        xv = (struct ocfs2_xattr_value_root *)
+                        vb->vb_xv = (struct ocfs2_xattr_value_root *)
                                (val + OCFS2_XATTR_SIZE(entry->xe_name_len));
-                        ret = ocfs2_xattr_value_truncate(inode, bh, xv, 0);
+                        ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt);
                        if (ret < 0) {
                                mlog_errno(ret);
-                                return ret;
+                                break;
                        }
                }
        }
+        ocfs2_commit_trans(osb, ctxt.handle);
+        ocfs2_schedule_truncate_log_flush(osb, 1);
+        ocfs2_run_deallocs(osb, &ctxt.dealloc);
+out:
        return ret;
 }
@@ -1463,12 +1744,16 @@ static int ocfs2_xattr_ibody_remove(struct inode *inode,
        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
        struct ocfs2_xattr_header *header;
        int ret;
+        struct ocfs2_xattr_value_buf vb = {
+                .vb_bh = di_bh,
+                .vb_access = ocfs2_journal_access_di,
+        };
        header = (struct ocfs2_xattr_header *)
                 ((void *)di + inode->i_sb->s_blocksize -
                 le16_to_cpu(di->i_xattr_inline_size));
-        ret = ocfs2_remove_value_outside(inode, di_bh, header);
+        ret = ocfs2_remove_value_outside(inode, &vb, header);
        return ret;
 }
@@ -1478,11 +1763,15 @@ static int ocfs2_xattr_block_remove(struct inode *inode,
 {
        struct ocfs2_xattr_block *xb;
        int ret = 0;
+        struct ocfs2_xattr_value_buf vb = {
+                .vb_bh = blk_bh,
+                .vb_access = ocfs2_journal_access_xb,
+        };
        xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
        if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
                struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header);
-                ret = ocfs2_remove_value_outside(inode, blk_bh, header);
+                ret = ocfs2_remove_value_outside(inode, &vb, header);
        } else
                ret = ocfs2_delete_xattr_index_block(inode, blk_bh);
@@ -1502,24 +1791,19 @@ static int ocfs2_xattr_free_block(struct inode *inode,
        u64 blk, bg_blkno;
        u16 bit;
-        ret = ocfs2_read_block(inode, block, &blk_bh);
+        ret = ocfs2_read_xattr_block(inode, block, &blk_bh);
        if (ret < 0) {
                mlog_errno(ret);
                goto out;
        }
-        xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
-        if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
-                ret = -EIO;
-                goto out;
-        }
        ret = ocfs2_xattr_block_remove(inode, blk_bh);
        if (ret < 0) {
                mlog_errno(ret);
                goto out;
        }
+        xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
        blk = le64_to_cpu(xb->xb_blkno);
        bit = le16_to_cpu(xb->xb_suballoc_bit);
        bg_blkno = ocfs2_which_suballoc_group(blk, bit);
@@ -1606,8 +1890,8 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
                mlog_errno(ret);
                goto out;
        }
-        ret = ocfs2_journal_access(handle, inode, di_bh,
+        ret = ocfs2_journal_access_di(handle, inode, di_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                                      OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
                goto out_commit;
@@ -1714,7 +1998,8 @@ static int ocfs2_xattr_ibody_find(struct inode *inode,
 */
 static int ocfs2_xattr_ibody_set(struct inode *inode,
                                 struct ocfs2_xattr_info *xi,
-                                 struct ocfs2_xattr_search *xs)
+                                 struct ocfs2_xattr_search *xs,
+                                 struct ocfs2_xattr_set_ctxt *ctxt)
 {
        struct ocfs2_inode_info *oi = OCFS2_I(inode);
        struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
@@ -1731,7 +2016,7 @@ static int ocfs2_xattr_ibody_set(struct inode *inode,
                }
        }
-        ret = ocfs2_xattr_set_entry(inode, xi, xs,
+        ret = ocfs2_xattr_set_entry(inode, xi, xs, ctxt,
                                (OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL));
 out:
        up_write(&oi->ip_alloc_sem);
@@ -1758,19 +2043,15 @@ static int ocfs2_xattr_block_find(struct inode *inode,
        if (!di->i_xattr_loc)
                return ret;
-        ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh);
+        ret = ocfs2_read_xattr_block(inode, le64_to_cpu(di->i_xattr_loc),
+                                     &blk_bh);
        if (ret < 0) {
                mlog_errno(ret);
                return ret;
        }
-        xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
-        if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
-                ret = -EIO;
-                goto cleanup;
-        }
        xs->xattr_bh = blk_bh;
+        xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
        if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
                xs->header = &xb->xb_attrs.xb_header;
@@ -1804,13 +2085,13 @@ cleanup:
 */
 static int ocfs2_xattr_block_set(struct inode *inode,
                                 struct ocfs2_xattr_info *xi,
-                                 struct ocfs2_xattr_search *xs)
+                                 struct ocfs2_xattr_search *xs,
+                                 struct ocfs2_xattr_set_ctxt *ctxt)
 {
        struct buffer_head *new_bh = NULL;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        struct ocfs2_dinode *di =  (struct ocfs2_dinode *)xs->inode_bh->b_data;
-        struct ocfs2_alloc_context *meta_ac = NULL;
+        handle_t *handle = ctxt->handle;
-        handle_t *handle = NULL;
        struct ocfs2_xattr_block *xblk = NULL;
        u16 suballoc_bit_start;
        u32 num_got;
@@ -1818,45 +2099,29 @@ static int ocfs2_xattr_block_set(struct inode *inode,
        int ret;
        if (!xs->xattr_bh) {
-                /*
+                ret = ocfs2_journal_access_di(handle, inode, xs->inode_bh,
-                 * Alloc one external block for extended attribute
+                                              OCFS2_JOURNAL_ACCESS_CREATE);
-                 * outside of inode.
-                 */
-                ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
                if (ret < 0) {
                        mlog_errno(ret);
-                        goto out;
+                        goto end;
-                }
-                handle = ocfs2_start_trans(osb,
-                                           OCFS2_XATTR_BLOCK_CREATE_CREDITS);
-                if (IS_ERR(handle)) {
-                        ret = PTR_ERR(handle);
-                        mlog_errno(ret);
-                        goto out;
-                }
-                ret = ocfs2_journal_access(handle, inode, xs->inode_bh,
-                                           OCFS2_JOURNAL_ACCESS_CREATE);
-                if (ret < 0) {
-                        mlog_errno(ret);
-                        goto out_commit;
                }
-                ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1,
+                ret = ocfs2_claim_metadata(osb, handle, ctxt->meta_ac, 1,
                                           &suballoc_bit_start, &num_got,
                                           &first_blkno);
                if (ret < 0) {
                        mlog_errno(ret);
-                        goto out_commit;
+                        goto end;
                }
                new_bh = sb_getblk(inode->i_sb, first_blkno);
                ocfs2_set_new_buffer_uptodate(inode, new_bh);
-                ret = ocfs2_journal_access(handle, inode, new_bh,
+                ret = ocfs2_journal_access_xb(handle, inode, new_bh,
-                                           OCFS2_JOURNAL_ACCESS_CREATE);
+                                              OCFS2_JOURNAL_ACCESS_CREATE);
                if (ret < 0) {
                        mlog_errno(ret);
-                        goto out_commit;
+                        goto end;
                }
                /* Initialize ocfs2_xattr_block */
@@ -1874,44 +2139,555 @@ static int ocfs2_xattr_block_set(struct inode *inode,
                xs->end = (void *)xblk + inode->i_sb->s_blocksize;
                xs->here = xs->header->xh_entries;
                ret = ocfs2_journal_dirty(handle, new_bh);
                if (ret < 0) {
                        mlog_errno(ret);
-                        goto out_commit;
+                        goto end;
                }
                di->i_xattr_loc = cpu_to_le64(first_blkno);
-                ret = ocfs2_journal_dirty(handle, xs->inode_bh);
+                ocfs2_journal_dirty(handle, xs->inode_bh);
-                if (ret < 0)
-                        mlog_errno(ret);
-out_commit:
-                ocfs2_commit_trans(osb, handle);
-out:
-                if (meta_ac)
-                        ocfs2_free_alloc_context(meta_ac);
-                if (ret < 0)
-                        return ret;
        } else
                xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
        if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) {
                /* Set extended attribute into external block */
-                ret = ocfs2_xattr_set_entry(inode, xi, xs, OCFS2_HAS_XATTR_FL);
+                ret = ocfs2_xattr_set_entry(inode, xi, xs, ctxt,
+                                            OCFS2_HAS_XATTR_FL);
                if (!ret || ret != -ENOSPC)
                        goto end;
-                ret = ocfs2_xattr_create_index_block(inode, xs);
+                ret = ocfs2_xattr_create_index_block(inode, xs, ctxt);
                if (ret)
                        goto end;
        }
-        ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs);
+        ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs, ctxt);
 end:
        return ret;
 }
+/*
+ * Check whether the new xattr can be inserted into the inode.
+ *
+ * Returns 1 when the free room in the region described by xs is large
+ * enough for a new entry, its name and its value (or a value root when
+ * the value is too big to be stored inline), and 0 otherwise.  The inode
+ * argument is not used by the body.
+ */
+static int ocfs2_xattr_can_be_in_inode(struct inode *inode,
+                                       struct ocfs2_xattr_info *xi,
+                                       struct ocfs2_xattr_search *xs)
+{
+        u64 value_size;
+        struct ocfs2_xattr_entry *last;
+        int free, i;
+        /* Start from the full size of the region; lowered in the loop. */
+        size_t min_offs = xs->end - xs->base;
+        /* No header to inspect: report that the xattr does not fit. */
+        if (!xs->header)
+                return 0;
+        /*
+         * Walk every entry to find the lowest name offset in use; when the
+         * loop ends, last points just past the final entry.
+         */
+        last = xs->header->xh_entries;
+        for (i = 0; i < le16_to_cpu(xs->header->xh_count); i++) {
+                size_t offs = le16_to_cpu(last->xe_name_offset);
+                if (offs < min_offs)
+                        min_offs = offs;
+                last += 1;
+        }
+        /*
+         * Room left between the end of the entry array and the lowest name
+         * offset, less sizeof(__u32).
+         */
+        free = min_offs - ((void *)last - xs->base) - sizeof(__u32);
+        if (free < 0)
+                return 0;
+        /* This check is only meant for an xattr the search did not find. */
+        BUG_ON(!xs->not_found);
+        /* A value above the inline limit only costs a value root here. */
+        if (xi->value_len > OCFS2_XATTR_INLINE_SIZE)
+                value_size = OCFS2_XATTR_ROOT_SIZE;
+        else
+                value_size = OCFS2_XATTR_SIZE(xi->value_len);
+        if (free >= sizeof(struct ocfs2_xattr_entry) +
+                   OCFS2_XATTR_SIZE(strlen(xi->name)) + value_size)
+                return 1;
+        return 0;
+}
+/*
+ * Estimate what setting (or removing) the xattr described by xi will
+ * need: clusters to allocate, metadata blocks to allocate and journal
+ * credits.
+ *
+ * xis and xbs are the search results for the inode body and for the
+ * external xattr block; their not_found fields select which case is
+ * being sized.  clusters_need, meta_need and credits_need may each be
+ * NULL, in which case that result is not stored.  The results are
+ * stored on every path, including when an error is returned.
+ *
+ * Returns ret, which is only ever assigned from
+ * ocfs2_xattr_bucket_get_name_value() and ocfs2_read_xattr_block().
+ */
+static int ocfs2_calc_xattr_set_need(struct inode *inode,
+                                     struct ocfs2_dinode *di,
+                                     struct ocfs2_xattr_info *xi,
+                                     struct ocfs2_xattr_search *xis,
+                                     struct ocfs2_xattr_search *xbs,
+                                     int *clusters_need,
+                                     int *meta_need,
+                                     int *credits_need)
+{
+        int ret = 0, old_in_xb = 0;
+        int clusters_add = 0, meta_add = 0, credits = 0;
+        struct buffer_head *bh = NULL;
+        struct ocfs2_xattr_block *xb = NULL;
+        struct ocfs2_xattr_entry *xe = NULL;
+        struct ocfs2_xattr_value_root *xv = NULL;
+        char *base = NULL;
+        int name_offset, name_len = 0;
+        u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb,
+                                                    xi->value_len);
+        u64 value_size;
+        /*
+         * Calculate the clusters we need to write.
+         * No matter whether we replace an old one or add a new one,
+         * we need this for writing.
+         */
+        if (xi->value_len > OCFS2_XATTR_INLINE_SIZE)
+                credits += new_clusters *
+                           ocfs2_clusters_to_blocks(inode->i_sb, 1);
+        /*
+         * Brand new xattr (found neither in the inode nor in the block):
+         * there is no old entry to examine, so size the new value against
+         * def_xv (defined outside this function) and go straight to the
+         * metadata guess.
+         */
+        if (xis->not_found && xbs->not_found) {
+                credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+                if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
+                        clusters_add += new_clusters;
+                        credits += ocfs2_calc_extend_credits(inode->i_sb,
+                                                        &def_xv.xv.xr_list,
+                                                        new_clusters);
+                }
+                goto meta_guess;
+        }
+        /*
+         * An old entry exists.  Locate it (xe), its name offset/length and
+         * the base address those offsets are relative to.
+         */
+        if (!xis->not_found) {
+                xe = xis->here;
+                name_offset = le16_to_cpu(xe->xe_name_offset);
+                name_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
+                base = xis->base;
+                credits += OCFS2_INODE_UPDATE_CREDITS;
+        } else {
+                int i, block_off = 0;
+                xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data;
+                xe = xbs->here;
+                name_offset = le16_to_cpu(xe->xe_name_offset);
+                name_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
+                i = xbs->here - xbs->header->xh_entries;
+                old_in_xb = 1;
+                if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
+                        /*
+                         * NOTE(review): ret is not checked before block_off
+                         * is used below -- confirm a failure here cannot
+                         * leave block_off/name_offset unusable.
+                         */
+                        ret = ocfs2_xattr_bucket_get_name_value(inode,
+                                                        bucket_xh(xbs->bucket),
+                                                        i, &block_off,
+                                                        &name_offset);
+                        base = bucket_block(xbs->bucket, block_off);
+                        credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+                } else {
+                        base = xbs->base;
+                        credits += OCFS2_XATTR_BLOCK_UPDATE_CREDITS;
+                }
+        }
+        /*
+         * delete a xattr doesn't need metadata and cluster allocation.
+         * so just calculate the credits and return.
+         *
+         * The credits for removing the value tree will be extended
+         * by ocfs2_remove_extent itself.
+         */
+        if (!xi->value) {
+                if (!ocfs2_xattr_is_local(xe))
+                        credits += ocfs2_remove_extent_credits(inode->i_sb);
+                goto out;
+        }
+        /* do cluster allocation guess first. */
+        value_size = le64_to_cpu(xe->xe_value_size);
+        if (old_in_xb) {
+                /*
+                 * In xattr set, we always try to set the xe in inode first,
+                 * so if it can be inserted into inode successfully, the old
+                 * one will be removed from the xattr block, and this xattr
+                 * will be inserted into inode as a new xattr in inode.
+                 */
+                if (ocfs2_xattr_can_be_in_inode(inode, xi, xis)) {
+                        clusters_add += new_clusters;
+                        credits += ocfs2_remove_extent_credits(inode->i_sb) +
+                                    OCFS2_INODE_UPDATE_CREDITS;
+                        if (!ocfs2_xattr_is_local(xe))
+                                credits += ocfs2_calc_extend_credits(
+                                                        inode->i_sb,
+                                                        &def_xv.xv.xr_list,
+                                                        new_clusters);
+                        goto out;
+                }
+        }
+        if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
+                /* the new values will be stored outside. */
+                u32 old_clusters = 0;
+                /*
+                 * If the old value is also stored outside, use its value
+                 * root (located right after the name) and its cluster
+                 * count; otherwise size against the default root def_xv.
+                 */
+                if (!ocfs2_xattr_is_local(xe)) {
+                        old_clusters =  ocfs2_clusters_for_bytes(inode->i_sb,
+                                                                 value_size);
+                        xv = (struct ocfs2_xattr_value_root *)
+                             (base + name_offset + name_len);
+                        value_size = OCFS2_XATTR_ROOT_SIZE;
+                } else
+                        xv = &def_xv.xv;
+                /*
+                 * Not growing: only removal credits are added.  Growing:
+                 * account for the extra clusters and tree metadata, and
+                 * skip the metadata guess when the old value already
+                 * occupies at least a value root.
+                 */
+                if (old_clusters >= new_clusters) {
+                        credits += ocfs2_remove_extent_credits(inode->i_sb);
+                        goto out;
+                } else {
+                        meta_add += ocfs2_extend_meta_needed(&xv->xr_list);
+                        clusters_add += new_clusters - old_clusters;
+                        credits += ocfs2_calc_extend_credits(inode->i_sb,
+                                                             &xv->xr_list,
+                                                             new_clusters -
+                                                             old_clusters);
+                        if (value_size >= OCFS2_XATTR_ROOT_SIZE)
+                                goto out;
+                }
+        } else {
+                /*
+                 * Now the new value will be stored inside. So if the new
+                 * value is smaller than the size of value root or the old
+                 * value, we don't need any allocation, otherwise we have
+                 * to guess metadata allocation.
+                 */
+                if ((ocfs2_xattr_is_local(xe) && value_size >= xi->value_len) ||
+                    (!ocfs2_xattr_is_local(xe) &&
+                     OCFS2_XATTR_ROOT_SIZE >= xi->value_len))
+                        goto out;
+        }
+meta_guess:
+        /* calculate metadata allocation. */
+        if (di->i_xattr_loc) {
+                /*
+                 * Use the xattr block the search already holds, or read it
+                 * into bh (released at out) when the search has none.
+                 */
+                if (!xbs->xattr_bh) {
+                        ret = ocfs2_read_xattr_block(inode,
+                                                     le64_to_cpu(di->i_xattr_loc),
+                                                     &bh);
+                        if (ret) {
+                                mlog_errno(ret);
+                                goto out;
+                        }
+                        xb = (struct ocfs2_xattr_block *)bh->b_data;
+                } else
+                        xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data;
+                /*
+                 * If there is already an xattr tree, good, we can calculate
+                 * like other b-trees. Otherwise we may have the chance of
+                 * create a tree, the credit calculation is borrowed from
+                 * ocfs2_calc_extend_credits with root_el = NULL. And the
+                 * new tree will be cluster based, so no meta is needed.
+                 */
+                if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
+                        struct ocfs2_extent_list *el =
+                                 &xb->xb_attrs.xb_root.xt_list;
+                        meta_add += ocfs2_extend_meta_needed(el);
+                        credits += ocfs2_calc_extend_credits(inode->i_sb,
+                                                             el, 1);
+                } else
+                        credits += OCFS2_SUBALLOC_ALLOC + 1;
+                /*
+                 * This cluster will be used either for new bucket or for
+                 * new xattr block.
+                 * If the cluster size is the same as the bucket size, one
+                 * more is needed since we may need to extend the bucket
+                 * also.
+                 */
+                clusters_add += 1;
+                credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+                if (OCFS2_XATTR_BUCKET_SIZE ==
+                        OCFS2_SB(inode->i_sb)->s_clustersize) {
+                        credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+                        clusters_add += 1;
+                }
+        } else {
+                /* No xattr block yet: one metadata block to create it. */
+                meta_add += 1;
+                credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
+        }
+out:
+        /* Hand back whichever totals the caller asked for. */
+        if (clusters_need)
+                *clusters_need = clusters_add;
+        if (meta_need)
+                *meta_need = meta_add;
+        if (credits_need)
+                *credits_need = credits;
+        brelse(bh);
+        return ret;
+}
+/*
+ * Prepare ctxt for an xattr set: zero it, initialise its dealloc
+ * context, compute the needs with ocfs2_calc_xattr_set_need() and
+ * reserve the metadata blocks (ctxt->meta_ac) and clusters
+ * (ctxt->data_ac) that computation asks for.
+ *
+ * The credit estimate is stored in *credits; no transaction is started
+ * here.  Returns 0 on success or the first error encountered; when the
+ * function fails, any metadata reservation made here is freed and
+ * ctxt->meta_ac is reset to NULL.
+ */
+static int ocfs2_init_xattr_set_ctxt(struct inode *inode,
+                                     struct ocfs2_dinode *di,
+                                     struct ocfs2_xattr_info *xi,
+                                     struct ocfs2_xattr_search *xis,
+                                     struct ocfs2_xattr_search *xbs,
+                                     struct ocfs2_xattr_set_ctxt *ctxt,
+                                     int *credits)
+{
+        int clusters_add, meta_add, ret;
+        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        /* Note: this also clears ctxt->handle; the caller sets it later. */
+        memset(ctxt, 0, sizeof(struct ocfs2_xattr_set_ctxt));
+        ocfs2_init_dealloc_ctxt(&ctxt->dealloc);
+        ret = ocfs2_calc_xattr_set_need(inode, di, xi, xis, xbs,
+                                        &clusters_add, &meta_add, credits);
+        if (ret) {
+                mlog_errno(ret);
+                return ret;
+        }
+        mlog(0, "Set xattr %s, reserve meta blocks = %d, clusters = %d, "
+             "credits = %d\n", xi->name, meta_add, clusters_add, *credits);
+        /* Reserve only what the estimate says is needed. */
+        if (meta_add) {
+                ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add,
+                                                        &ctxt->meta_ac);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+        }
+        if (clusters_add) {
+                ret = ocfs2_reserve_clusters(osb, clusters_add, &ctxt->data_ac);
+                if (ret)
+                        mlog_errno(ret);
+        }
+out:
+        if (ret) {
+                /* Undo the metadata reservation if a later step failed. */
+                if (ctxt->meta_ac) {
+                        ocfs2_free_alloc_context(ctxt->meta_ac);
+                        ctxt->meta_ac = NULL;
+                }
+                /*
+                 * We cannot have an error and a non null ctxt->data_ac.
+                 */
+        }
+        return ret;
+}
+/*
+ * Perform the xattr set or removal described by xi inside the
+ * transaction and allocation contexts carried by ctxt.
+ *
+ * A NULL xi->value means removal from wherever the xattr was found.
+ * Otherwise the inode body is tried first and the external block is
+ * used when the inode body returns -ENOSPC; a stale copy left in the
+ * other location is then removed.  Before each of those follow-up
+ * operations the extra credits are recomputed and requested through
+ * ocfs2_extend_trans().  On success the inode ctime is updated.
+ *
+ * Note that xi->value and xi->value_len are cleared when a stale copy
+ * is removed.  Returns 0 on success or a negative error code.
+ */
+static int __ocfs2_xattr_set_handle(struct inode *inode,
+                                    struct ocfs2_dinode *di,
+                                    struct ocfs2_xattr_info *xi,
+                                    struct ocfs2_xattr_search *xis,
+                                    struct ocfs2_xattr_search *xbs,
+                                    struct ocfs2_xattr_set_ctxt *ctxt)
+{
+        /* old_found holds the saved value of xis->not_found. */
+        int ret = 0, credits, old_found;
+        if (!xi->value) {
+                /* Remove existing extended attribute */
+                if (!xis->not_found)
+                        ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt);
+                else if (!xbs->not_found)
+                        ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt);
+        } else {
+                /* We always try to set extended attribute into inode first*/
+                ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt);
+                if (!ret && !xbs->not_found) {
+                        /*
+                         * If succeed and that extended attribute existing in
+                         * external block, then we will remove it.
+                         */
+                        xi->value = NULL;
+                        xi->value_len = 0;
+                        /*
+                         * Temporarily mark the inode search as not found so
+                         * the estimate below sizes the block-side entry,
+                         * then restore the saved value.
+                         */
+                        old_found = xis->not_found;
+                        xis->not_found = -ENODATA;
+                        ret = ocfs2_calc_xattr_set_need(inode,
+                                                        di,
+                                                        xi,
+                                                        xis,
+                                                        xbs,
+                                                        NULL,
+                                                        NULL,
+                                                        &credits);
+                        xis->not_found = old_found;
+                        if (ret) {
+                                mlog_errno(ret);
+                                goto out;
+                        }
+                        /*
+                         * Request the new estimate on top of the credits
+                         * the handle currently reports.
+                         */
+                        ret = ocfs2_extend_trans(ctxt->handle, credits +
+                                        ctxt->handle->h_buffer_credits);
+                        if (ret) {
+                                mlog_errno(ret);
+                                goto out;
+                        }
+                        ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt);
+                } else if (ret == -ENOSPC) {
+                        /*
+                         * The inode has an xattr block that has not been
+                         * searched yet: search it now and re-estimate the
+                         * credits with the inode search masked as not found.
+                         */
+                        if (di->i_xattr_loc && !xbs->xattr_bh) {
+                                ret = ocfs2_xattr_block_find(inode,
+                                                             xi->name_index,
+                                                             xi->name, xbs);
+                                if (ret)
+                                        goto out;
+                                old_found = xis->not_found;
+                                xis->not_found = -ENODATA;
+                                ret = ocfs2_calc_xattr_set_need(inode,
+                                                                di,
+                                                                xi,
+                                                                xis,
+                                                                xbs,
+                                                                NULL,
+                                                                NULL,
+                                                                &credits);
+                                xis->not_found = old_found;
+                                if (ret) {
+                                        mlog_errno(ret);
+                                        goto out;
+                                }
+                                ret = ocfs2_extend_trans(ctxt->handle, credits +
+                                        ctxt->handle->h_buffer_credits);
+                                if (ret) {
+                                        mlog_errno(ret);
+                                        goto out;
+                                }
+                        }
+                        /*
+                         * If no space in inode, we will set extended attribute
+                         * into external block.
+                         */
+                        ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt);
+                        if (ret)
+                                goto out;
+                        if (!xis->not_found) {
+                                /*
+                                 * If succeed and that extended attribute
+                                 * existing in inode, we will remove it.
+                                 */
+                                xi->value = NULL;
+                                xi->value_len = 0;
+                                /*
+                                 * xbs is marked not found here and, unlike
+                                 * xis above, is not restored afterwards.
+                                 */
+                                xbs->not_found = -ENODATA;
+                                ret = ocfs2_calc_xattr_set_need(inode,
+                                                                di,
+                                                                xi,
+                                                                xis,
+                                                                xbs,
+                                                                NULL,
+                                                                NULL,
+                                                                &credits);
+                                if (ret) {
+                                        mlog_errno(ret);
+                                        goto out;
+                                }
+                                ret = ocfs2_extend_trans(ctxt->handle, credits +
+                                                ctxt->handle->h_buffer_credits);
+                                if (ret) {
+                                        mlog_errno(ret);
+                                        goto out;
+                                }
+                                ret = ocfs2_xattr_ibody_set(inode, xi,
+                                                            xis, ctxt);
+                        }
+                }
+        }
+        if (!ret) {
+                /* Update inode ctime. */
+                ret = ocfs2_journal_access(ctxt->handle, inode, xis->inode_bh,
+                                           OCFS2_JOURNAL_ACCESS_WRITE);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+                /* Keep the in-memory inode and the on-disk dinode in step. */
+                inode->i_ctime = CURRENT_TIME;
+                di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+                di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+                ocfs2_journal_dirty(ctxt->handle, xis->inode_bh);
+        }
+out:
+        return ret;
+}
+/*
+ * This function is only called during inode creation, to initialize
+ * the security/acl xattrs of the new inode.
+ * All transaction credits have been reserved in mknod.
+ *
+ * The caller supplies the running transaction (handle) and the
+ * allocation contexts (meta_ac, data_ac); they are passed on through a
+ * local ocfs2_xattr_set_ctxt and are neither committed nor freed here.
+ * The flags argument is not used by the body.
+ *
+ * Returns -EOPNOTSUPP when the filesystem does not support xattrs,
+ * -ENOMEM when the bucket cannot be allocated, otherwise the result of
+ * the lookups or of __ocfs2_xattr_set_handle().
+ */
+int ocfs2_xattr_set_handle(handle_t *handle,
+                           struct inode *inode,
+                           struct buffer_head *di_bh,
+                           int name_index,
+                           const char *name,
+                           const void *value,
+                           size_t value_len,
+                           int flags,
+                           struct ocfs2_alloc_context *meta_ac,
+                           struct ocfs2_alloc_context *data_ac)
+{
+        struct ocfs2_dinode *di;
+        int ret;
+        struct ocfs2_xattr_info xi = {
+                .name_index = name_index,
+                .name = name,
+                .value = value,
+                .value_len = value_len,
+        };
+        /* Both searches start out as "not found". */
+        struct ocfs2_xattr_search xis = {
+                .not_found = -ENODATA,
+        };
+        struct ocfs2_xattr_search xbs = {
+                .not_found = -ENODATA,
+        };
+        struct ocfs2_xattr_set_ctxt ctxt = {
+                .handle = handle,
+                .meta_ac = meta_ac,
+                .data_ac = data_ac,
+        };
+        if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
+                return -EOPNOTSUPP;
+        /*
+         * In extreme situation, may need xattr bucket when
+         * block size is too small. And we have already reserved
+         * the credits for bucket in mknod.
+         */
+        if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE) {
+                xbs.bucket = ocfs2_xattr_bucket_new(inode);
+                if (!xbs.bucket) {
+                        mlog_errno(-ENOMEM);
+                        return -ENOMEM;
+                }
+        }
+        xis.inode_bh = xbs.inode_bh = di_bh;
+        di = (struct ocfs2_dinode *)di_bh->b_data;
+        /* ip_xattr_sem is held for writing across the lookup and the set. */
+        down_write(&OCFS2_I(inode)->ip_xattr_sem);
+        ret = ocfs2_xattr_ibody_find(inode, name_index, name, &xis);
+        if (ret)
+                goto cleanup;
+        /* Only look in the external block if the inode body has no match. */
+        if (xis.not_found) {
+                ret = ocfs2_xattr_block_find(inode, name_index, name, &xbs);
+                if (ret)
+                        goto cleanup;
+        }
+        ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt);
+cleanup:
+        up_write(&OCFS2_I(inode)->ip_xattr_sem);
+        brelse(xbs.xattr_bh);
+        /*
+         * xbs.bucket is still NULL when the block size check above did not
+         * allocate one -- presumably ocfs2_xattr_bucket_free() accepts
+         * NULL; TODO confirm.
+         */
+        ocfs2_xattr_bucket_free(xbs.bucket);
+        return ret;
+}
 /*
 * ocfs2_xattr_set()
 *
@@ -1928,8 +2704,10 @@ int ocfs2_xattr_set(struct inode *inode,
 {
        struct buffer_head *di_bh = NULL;
        struct ocfs2_dinode *di;
-        int ret;
+        int ret, credits;
-        u16 i, blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        struct inode *tl_inode = osb->osb_tl_inode;
+        struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, };
        struct ocfs2_xattr_info xi = {
                .name_index = name_index,
@@ -1949,10 +2727,20 @@ int ocfs2_xattr_set(struct inode *inode,
        if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
                return -EOPNOTSUPP;
+        /*
+         * Only xbs will be used on indexed trees.  xis doesn't need a
+         * bucket.
+         */
+        xbs.bucket = ocfs2_xattr_bucket_new(inode);
+        if (!xbs.bucket) {
+                mlog_errno(-ENOMEM);
+                return -ENOMEM;
+        }
        ret = ocfs2_inode_lock(inode, &di_bh, 1);
        if (ret < 0) {
                mlog_errno(ret);
-                return ret;
+                goto cleanup_nolock;
        }
        xis.inode_bh = xbs.inode_bh = di_bh;
        di = (struct ocfs2_dinode *)di_bh->b_data;
@@ -1984,55 +2772,53 @@ int ocfs2_xattr_set(struct inode *inode,
                        goto cleanup;
        }
-        if (!value) {
-                /* Remove existing extended attribute */
+        mutex_lock(&tl_inode->i_mutex);
-                if (!xis.not_found)
-                        ret = ocfs2_xattr_ibody_set(inode, &xi, &xis);
+        if (ocfs2_truncate_log_needs_flush(osb)) {
-                else if (!xbs.not_found)
+                ret = __ocfs2_flush_truncate_log(osb);
-                        ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
+                if (ret < 0) {
-        } else {
+                        mutex_unlock(&tl_inode->i_mutex);
-                /* We always try to set extended attribute into inode first*/
+                        mlog_errno(ret);
-                ret = ocfs2_xattr_ibody_set(inode, &xi, &xis);
+                        goto cleanup;
-                if (!ret && !xbs.not_found) {
-                        /*
-                         * If succeed and that extended attribute existing in
-                         * external block, then we will remove it.
-                         */
-                        xi.value = NULL;
-                        xi.value_len = 0;
-                        ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
-                } else if (ret == -ENOSPC) {
-                        if (di->i_xattr_loc && !xbs.xattr_bh) {
-                                ret = ocfs2_xattr_block_find(inode, name_index,
-                                                             name, &xbs);
-                                if (ret)
-                                        goto cleanup;
-                        }
-                        /*
-                         * If no space in inode, we will set extended attribute
-                         * into external block.
-                         */
-                        ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
-                        if (ret)
-                                goto cleanup;
-                        if (!xis.not_found) {
-                                /*
-                                 * If succeed and that extended attribute
-                                 * existing in inode, we will remove it.
-                                 */
-                                xi.value = NULL;
-                                xi.value_len = 0;
-                                ret = ocfs2_xattr_ibody_set(inode, &xi, &xis);
-                        }
                }
        }
+        mutex_unlock(&tl_inode->i_mutex);
+        ret = ocfs2_init_xattr_set_ctxt(inode, di, &xi, &xis,
+                                        &xbs, &ctxt, &credits);
+        if (ret) {
+                mlog_errno(ret);
+                goto cleanup;
+        }
+        /* we need to update inode's ctime field, so add credit for it. */
+        credits += OCFS2_INODE_UPDATE_CREDITS;
+        ctxt.handle = ocfs2_start_trans(osb, credits);
+        if (IS_ERR(ctxt.handle)) {
+                ret = PTR_ERR(ctxt.handle);
+                mlog_errno(ret);
+                goto cleanup;
+        }
+        ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt);
+        ocfs2_commit_trans(osb, ctxt.handle);
+        if (ctxt.data_ac)
+                ocfs2_free_alloc_context(ctxt.data_ac);
+        if (ctxt.meta_ac)
+                ocfs2_free_alloc_context(ctxt.meta_ac);
+        if (ocfs2_dealloc_has_cluster(&ctxt.dealloc))
+                ocfs2_schedule_truncate_log_flush(osb, 1);
+        ocfs2_run_deallocs(osb, &ctxt.dealloc);
 cleanup:
        up_write(&OCFS2_I(inode)->ip_xattr_sem);
        ocfs2_inode_unlock(inode, 1);
+cleanup_nolock:
        brelse(di_bh);
        brelse(xbs.xattr_bh);
-        for (i = 0; i < blk_per_bucket; i++)
+        ocfs2_xattr_bucket_free(xbs.bucket);
-                brelse(xbs.bucket.bhs[i]);
        return ret;
 }
@@ -2107,7 +2893,7 @@ typedef int (xattr_bucket_func)(struct inode *inode,
                                void *para);
 static int ocfs2_find_xe_in_bucket(struct inode *inode,
-                                   struct buffer_head *header_bh,
+                                   struct ocfs2_xattr_bucket *bucket,
                                   int name_index,
                                   const char *name,
                                   u32 name_hash,
@@ -2115,11 +2901,9 @@ static int ocfs2_find_xe_in_bucket(struct inode *inode,
                                   int *found)
 {
        int i, ret = 0, cmp = 1, block_off, new_offset;
-        struct ocfs2_xattr_header *xh =
+        struct ocfs2_xattr_header *xh = bucket_xh(bucket);
-                        (struct ocfs2_xattr_header *)header_bh->b_data;
        size_t name_len = strlen(name);
        struct ocfs2_xattr_entry *xe = NULL;
-        struct buffer_head *name_bh = NULL;
        char *xe_name;
        /*
@@ -2150,19 +2934,9 @@ static int ocfs2_find_xe_in_bucket(struct inode *inode,
                        break;
                }
-                ret = ocfs2_read_block(inode, header_bh->b_blocknr + block_off,
-                                       &name_bh);
-                if (ret) {
-                        mlog_errno(ret);
-                        break;
-                }
-                xe_name = name_bh->b_data + new_offset;
-                cmp = memcmp(name, xe_name, name_len);
-                brelse(name_bh);
-                name_bh = NULL;
-                if (cmp == 0) {
+                xe_name = bucket_block(bucket, block_off) + new_offset;
+                if (!memcmp(name, xe_name, name_len)) {
                        *xe_index = i;
                        *found = 1;
                        ret = 0;
@@ -2192,39 +2966,42 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
                                   struct ocfs2_xattr_search *xs)
 {
        int ret, found = 0;
-        struct buffer_head *bh = NULL;
-        struct buffer_head *lower_bh = NULL;
        struct ocfs2_xattr_header *xh = NULL;
        struct ocfs2_xattr_entry *xe = NULL;
        u16 index = 0;
        u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
        int low_bucket = 0, bucket, high_bucket;
+        struct ocfs2_xattr_bucket *search;
        u32 last_hash;
-        u64 blkno;
+        u64 blkno, lower_blkno = 0;
+        search = ocfs2_xattr_bucket_new(inode);
+        if (!search) {
+                ret = -ENOMEM;
+                mlog_errno(ret);
+                goto out;
+        }
-        ret = ocfs2_read_block(inode, p_blkno, &bh);
+        ret = ocfs2_read_xattr_bucket(search, p_blkno);
        if (ret) {
                mlog_errno(ret);
                goto out;
        }
-        xh = (struct ocfs2_xattr_header *)bh->b_data;
+        xh = bucket_xh(search);
        high_bucket = le16_to_cpu(xh->xh_num_buckets) - 1;
        while (low_bucket <= high_bucket) {
-                brelse(bh);
+                ocfs2_xattr_bucket_relse(search);
-                bh = NULL;
-                bucket = (low_bucket + high_bucket) / 2;
+                bucket = (low_bucket + high_bucket) / 2;
                blkno = p_blkno + bucket * blk_per_bucket;
+                ret = ocfs2_read_xattr_bucket(search, blkno);
-                ret = ocfs2_read_block(inode, blkno, &bh);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
                }
-                xh = (struct ocfs2_xattr_header *)bh->b_data;
+                xh = bucket_xh(search);
                xe = &xh->xh_entries[0];
                if (name_hash < le32_to_cpu(xe->xe_name_hash)) {
                        high_bucket = bucket - 1;
@@ -2241,10 +3018,8 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
                last_hash = le32_to_cpu(xe->xe_name_hash);
-                /* record lower_bh which may be the insert place. */
+                /* record lower_blkno which may be the insert place. */
-                brelse(lower_bh);
+                lower_blkno = blkno;
-                lower_bh = bh;
-                bh = NULL;
                if (name_hash > le32_to_cpu(xe->xe_name_hash)) {
                        low_bucket = bucket + 1;
@@ -2252,7 +3027,7 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
                }
                /* the searched xattr should reside in this bucket if exists. */
-                ret = ocfs2_find_xe_in_bucket(inode, lower_bh,
+                ret = ocfs2_find_xe_in_bucket(inode, search,
                                              name_index, name, name_hash,
                                              &index, &found);
                if (ret) {
@@ -2267,46 +3042,29 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
         * When the xattr's hash value is in the gap of 2 buckets, we will
         * always set it to the previous bucket.
         */
-        if (!lower_bh) {
+        if (!lower_blkno)
-                /*
+                lower_blkno = p_blkno;
-                 * We can't find any bucket whose first name_hash is less
-                 * than the find name_hash.
+        /* This should be in cache - we just read it during the search */
-                 */
+        ret = ocfs2_read_xattr_bucket(xs->bucket, lower_blkno);
-                BUG_ON(bh->b_blocknr != p_blkno);
+        if (ret) {
-                lower_bh = bh;
+                mlog_errno(ret);
-                bh = NULL;
+                goto out;
        }
-        xs->bucket.bhs[0] = lower_bh;
-        xs->bucket.xh = (struct ocfs2_xattr_header *)
-                                        xs->bucket.bhs[0]->b_data;
-        lower_bh = NULL;
-        xs->header = xs->bucket.xh;
+        xs->header = bucket_xh(xs->bucket);
-        xs->base = xs->bucket.bhs[0]->b_data;
+        xs->base = bucket_block(xs->bucket, 0);
        xs->end = xs->base + inode->i_sb->s_blocksize;
        if (found) {
-                /*
-                 * If we have found the xattr enty, read all the blocks in
-                 * this bucket.
-                 */
-                ret = ocfs2_read_blocks(inode, xs->bucket.bhs[0]->b_blocknr + 1,
-                                        blk_per_bucket - 1, &xs->bucket.bhs[1],
-                                        0);
-                if (ret) {
-                        mlog_errno(ret);
-                        goto out;
-                }
                xs->here = &xs->header->xh_entries[index];
                mlog(0, "find xattr %s in bucket %llu, entry = %u\n", name,
-                     (unsigned long long)xs->bucket.bhs[0]->b_blocknr, index);
+                     (unsigned long long)bucket_blkno(xs->bucket), index);
        } else
                ret = -ENODATA;
 out:
-        brelse(bh);
+        ocfs2_xattr_bucket_free(search);
-        brelse(lower_bh);
        return ret;
 }
@@ -2357,53 +3115,50 @@ static int ocfs2_iterate_xattr_buckets(struct inode *inode,
                                       xattr_bucket_func *func,
                                       void *para)
 {
-        int i, j, ret = 0;
+        int i, ret = 0;
-        int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
        u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb));
        u32 num_buckets = clusters * bpc;
-        struct ocfs2_xattr_bucket bucket;
+        struct ocfs2_xattr_bucket *bucket;
-        memset(&bucket, 0, sizeof(bucket));
+        bucket = ocfs2_xattr_bucket_new(inode);
+        if (!bucket) {
+                mlog_errno(-ENOMEM);
+                return -ENOMEM;
+        }
        mlog(0, "iterating xattr buckets in %u clusters starting from %llu\n",
             clusters, (unsigned long long)blkno);
-        for (i = 0; i < num_buckets; i++, blkno += blk_per_bucket) {
+        for (i = 0; i < num_buckets; i++, blkno += bucket->bu_blocks) {
-                ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket,
+                ret = ocfs2_read_xattr_bucket(bucket, blkno);
-                                        bucket.bhs, 0);
                if (ret) {
                        mlog_errno(ret);
-                        goto out;
+                        break;
                }
-                bucket.xh = (struct ocfs2_xattr_header *)bucket.bhs[0]->b_data;
                /*
                 * The real bucket num in this series of blocks is stored
                 * in the 1st bucket.
                 */
                if (i == 0)
-                        num_buckets = le16_to_cpu(bucket.xh->xh_num_buckets);
+                        num_buckets = le16_to_cpu(bucket_xh(bucket)->xh_num_buckets);
                mlog(0, "iterating xattr bucket %llu, first hash %u\n",
                     (unsigned long long)blkno,
-                     le32_to_cpu(bucket.xh->xh_entries[0].xe_name_hash));
+                     le32_to_cpu(bucket_xh(bucket)->xh_entries[0].xe_name_hash));
                if (func) {
-                        ret = func(inode, &bucket, para);
+                        ret = func(inode, bucket, para);
-                        if (ret) {
+                        if (ret)
                                mlog_errno(ret);
-                                break;
+                        /* Fall through to bucket_relse() */
-                        }
                }
-                for (j = 0; j < blk_per_bucket; j++)
+                ocfs2_xattr_bucket_relse(bucket);
-                        brelse(bucket.bhs[j]);
+                if (ret)
-                memset(&bucket, 0, sizeof(bucket));
+                        break;
        }
-out:
+        ocfs2_xattr_bucket_free(bucket);
-        for (j = 0; j < blk_per_bucket; j++)
-                brelse(bucket.bhs[j]);
        return ret;
 }
@@ -2441,21 +3196,21 @@ static int ocfs2_list_xattr_bucket(struct inode *inode,
        int i, block_off, new_offset;
        const char *prefix, *name;
-        for (i = 0 ; i < le16_to_cpu(bucket->xh->xh_count); i++) {
+        for (i = 0 ; i < le16_to_cpu(bucket_xh(bucket)->xh_count); i++) {
-                struct ocfs2_xattr_entry *entry = &bucket->xh->xh_entries[i];
+                struct ocfs2_xattr_entry *entry = &bucket_xh(bucket)->xh_entries[i];
                type = ocfs2_xattr_get_type(entry);
                prefix = ocfs2_xattr_prefix(type);
                if (prefix) {
                        ret = ocfs2_xattr_bucket_get_name_value(inode,
-                                                                bucket->xh,
+                                                                bucket_xh(bucket),
                                                                i,
                                                                &block_off,
                                                                &new_offset);
                        if (ret)
                                break;
-                        name = (const char *)bucket->bhs[block_off]->b_data +
+                        name = (const char *)bucket_block(bucket, block_off) +
                                new_offset;
                        ret = ocfs2_xattr_list_entry(xl->buffer,
                                                     xl->buffer_size,
@@ -2540,32 +3295,34 @@ static void swap_xe(void *a, void *b, int size)
 /*
 * When the ocfs2_xattr_block is filled up, new bucket will be created
 * and all the xattr entries will be moved to the new bucket.
+ * The header goes at the start of the bucket, and the names+values are
+ * filled from the end.  This is why *target starts as the last buffer.
 * Note: we need to sort the entries since they are not saved in order
 * in the ocfs2_xattr_block.
 */
 static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
                                           struct buffer_head *xb_bh,
-                                           struct buffer_head *xh_bh,
+                                           struct ocfs2_xattr_bucket *bucket)
-                                           struct buffer_head *data_bh)
 {
        int i, blocksize = inode->i_sb->s_blocksize;
+        int blks = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
        u16 offset, size, off_change;
        struct ocfs2_xattr_entry *xe;
        struct ocfs2_xattr_block *xb =
                                (struct ocfs2_xattr_block *)xb_bh->b_data;
        struct ocfs2_xattr_header *xb_xh = &xb->xb_attrs.xb_header;
-        struct ocfs2_xattr_header *xh =
+        struct ocfs2_xattr_header *xh = bucket_xh(bucket);
-                                (struct ocfs2_xattr_header *)xh_bh->b_data;
        u16 count = le16_to_cpu(xb_xh->xh_count);
-        char *target = xh_bh->b_data, *src = xb_bh->b_data;
+        char *src = xb_bh->b_data;
+        char *target = bucket_block(bucket, blks - 1);
        mlog(0, "cp xattr from block %llu to bucket %llu\n",
             (unsigned long long)xb_bh->b_blocknr,
-             (unsigned long long)xh_bh->b_blocknr);
+             (unsigned long long)bucket_blkno(bucket));
+        for (i = 0; i < blks; i++)
+                memset(bucket_block(bucket, i), 0, blocksize);
-        memset(xh_bh->b_data, 0, blocksize);
-        if (data_bh)
-                memset(data_bh->b_data, 0, blocksize);
        /*
         * Since the xe_name_offset is based on ocfs2_xattr_header,
         * there is a offset change corresponding to the change of
@@ -2577,8 +3334,6 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
        size = blocksize - offset;
        /* copy all the names and values. */
-        if (data_bh)
-                target = data_bh->b_data;
        memcpy(target + offset, src + offset, size);
        /* Init new header now. */
@@ -2588,7 +3343,7 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
        xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE - size);
        /* copy all the entries. */
-        target = xh_bh->b_data;
+        target = bucket_block(bucket, 0);
        offset = offsetof(struct ocfs2_xattr_header, xh_entries);
        size = count * sizeof(struct ocfs2_xattr_entry);
        memcpy(target + offset, (char *)xb_xh + offset, size);
@@ -2614,73 +3369,47 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
 * While if the entry is in index b-tree, "bucket" indicates the
 * real place of the xattr.
 */
-static int ocfs2_xattr_update_xattr_search(struct inode *inode,
+static void ocfs2_xattr_update_xattr_search(struct inode *inode,
-                                           struct ocfs2_xattr_search *xs,
+                                            struct ocfs2_xattr_search *xs,
-                                           struct buffer_head *old_bh,
+                                            struct buffer_head *old_bh)
-                                           struct buffer_head *new_bh)
 {
-        int ret = 0;
        char *buf = old_bh->b_data;
        struct ocfs2_xattr_block *old_xb = (struct ocfs2_xattr_block *)buf;
        struct ocfs2_xattr_header *old_xh = &old_xb->xb_attrs.xb_header;
-        int i, blocksize = inode->i_sb->s_blocksize;
+        int i;
-        u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
-        xs->bucket.bhs[0] = new_bh;
-        get_bh(new_bh);
-        xs->bucket.xh = (struct ocfs2_xattr_header *)xs->bucket.bhs[0]->b_data;
-        xs->header = xs->bucket.xh;
-        xs->base = new_bh->b_data;
+        xs->header = bucket_xh(xs->bucket);
+        xs->base = bucket_block(xs->bucket, 0);
        xs->end = xs->base + inode->i_sb->s_blocksize;
-        if (!xs->not_found) {
+        if (xs->not_found)
-                if (OCFS2_XATTR_BUCKET_SIZE != blocksize) {
+                return;
-                        ret = ocfs2_read_blocks(inode,
-                                        xs->bucket.bhs[0]->b_blocknr + 1,
-                                        blk_per_bucket - 1, &xs->bucket.bhs[1],
-                                        0);
-                        if (ret) {
-                                mlog_errno(ret);
-                                return ret;
-                        }
-                }
-                i = xs->here - old_xh->xh_entries;
-                xs->here = &xs->header->xh_entries[i];
-        }
-        return ret;
+        i = xs->here - old_xh->xh_entries;
+        xs->here = &xs->header->xh_entries[i];
 }
 static int ocfs2_xattr_create_index_block(struct inode *inode,
-                                          struct ocfs2_xattr_search *xs)
+                                          struct ocfs2_xattr_search *xs,
+                                          struct ocfs2_xattr_set_ctxt *ctxt)
 {
-        int ret, credits = OCFS2_SUBALLOC_ALLOC;
+        int ret;
        u32 bit_off, len;
        u64 blkno;
-        handle_t *handle;
+        handle_t *handle = ctxt->handle;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        struct ocfs2_inode_info *oi = OCFS2_I(inode);
-        struct ocfs2_alloc_context *data_ac;
-        struct buffer_head *xh_bh = NULL, *data_bh = NULL;
        struct buffer_head *xb_bh = xs->xattr_bh;
        struct ocfs2_xattr_block *xb =
                        (struct ocfs2_xattr_block *)xb_bh->b_data;
        struct ocfs2_xattr_tree_root *xr;
        u16 xb_flags = le16_to_cpu(xb->xb_flags);
-        u16 bpb = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
        mlog(0, "create xattr index block for %llu\n",
             (unsigned long long)xb_bh->b_blocknr);
        BUG_ON(xb_flags & OCFS2_XATTR_INDEXED);
+        BUG_ON(!xs->bucket);
-        ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
-        if (ret) {
-                mlog_errno(ret);
-                goto out;
-        }
        /*
         * XXX:
@@ -2689,29 +3418,18 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
         */
        down_write(&oi->ip_alloc_sem);
-        /*
+        ret = ocfs2_journal_access_xb(handle, inode, xb_bh,
-         * 3 more credits, one for xattr block update, one for the 1st block
+                                      OCFS2_JOURNAL_ACCESS_WRITE);
-         * of the new xattr bucket and one for the value/data.
-         */
-        credits += 3;
-        handle = ocfs2_start_trans(osb, credits);
-        if (IS_ERR(handle)) {
-                ret = PTR_ERR(handle);
-                mlog_errno(ret);
-                goto out_sem;
-        }
-        ret = ocfs2_journal_access(handle, inode, xb_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
-                goto out_commit;
+                goto out;
        }
-        ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &len);
+        ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac,
+                                     1, 1, &bit_off, &len);
        if (ret) {
                mlog_errno(ret);
-                goto out_commit;
+                goto out;
        }
        /*
@@ -2724,51 +3442,23 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
        mlog(0, "allocate 1 cluster from %llu to xattr block\n",
             (unsigned long long)blkno);
-        xh_bh = sb_getblk(inode->i_sb, blkno);
+        ret = ocfs2_init_xattr_bucket(xs->bucket, blkno);
-        if (!xh_bh) {
+        if (ret) {
-                ret = -EIO;
                mlog_errno(ret);
-                goto out_commit;
+                goto out;
        }
-        ocfs2_set_new_buffer_uptodate(inode, xh_bh);
+        ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
+                                                OCFS2_JOURNAL_ACCESS_CREATE);
-        ret = ocfs2_journal_access(handle, inode, xh_bh,
-                                   OCFS2_JOURNAL_ACCESS_CREATE);
        if (ret) {
                mlog_errno(ret);
-                goto out_commit;
+                goto out;
-        }
-        if (bpb > 1) {
-                data_bh = sb_getblk(inode->i_sb, blkno + bpb - 1);
-                if (!data_bh) {
-                        ret = -EIO;
-                        mlog_errno(ret);
-                        goto out_commit;
-                }
-                ocfs2_set_new_buffer_uptodate(inode, data_bh);
-                ret = ocfs2_journal_access(handle, inode, data_bh,
-                                           OCFS2_JOURNAL_ACCESS_CREATE);
-                if (ret) {
-                        mlog_errno(ret);
-                        goto out_commit;
-                }
        }
-        ocfs2_cp_xattr_block_to_bucket(inode, xb_bh, xh_bh, data_bh);
+        ocfs2_cp_xattr_block_to_bucket(inode, xb_bh, xs->bucket);
+        ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
-        ocfs2_journal_dirty(handle, xh_bh);
-        if (data_bh)
-                ocfs2_journal_dirty(handle, data_bh);
-        ret = ocfs2_xattr_update_xattr_search(inode, xs, xb_bh, xh_bh);
+        ocfs2_xattr_update_xattr_search(inode, xs, xb_bh);
-        if (ret) {
-                mlog_errno(ret);
-                goto out_commit;
-        }
        /* Change from ocfs2_xattr_header to ocfs2_xattr_tree_root */
        memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize -
@@ -2787,24 +3477,10 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
        xb->xb_flags = cpu_to_le16(xb_flags | OCFS2_XATTR_INDEXED);
-        ret = ocfs2_journal_dirty(handle, xb_bh);
+        ocfs2_journal_dirty(handle, xb_bh);
-        if (ret) {
-                mlog_errno(ret);
-                goto out_commit;
-        }
-out_commit:
-        ocfs2_commit_trans(osb, handle);
-out_sem:
-        up_write(&oi->ip_alloc_sem);
 out:
-        if (data_ac)
+        up_write(&oi->ip_alloc_sem);
-                ocfs2_free_alloc_context(data_ac);
-        brelse(xh_bh);
-        brelse(data_bh);
        return ret;
 }
@@ -2829,29 +3505,18 @@ static int cmp_xe_offset(const void *a, const void *b)
 * so that we can spare some space for insertion.
 */
 static int ocfs2_defrag_xattr_bucket(struct inode *inode,
+                                     handle_t *handle,
                                     struct ocfs2_xattr_bucket *bucket)
 {
        int ret, i;
        size_t end, offset, len, value_len;
        struct ocfs2_xattr_header *xh;
        char *entries, *buf, *bucket_buf = NULL;
-        u64 blkno = bucket->bhs[0]->b_blocknr;
+        u64 blkno = bucket_blkno(bucket);
-        u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
        u16 xh_free_start;
        size_t blocksize = inode->i_sb->s_blocksize;
-        handle_t *handle;
-        struct buffer_head **bhs;
        struct ocfs2_xattr_entry *xe;
-        bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket,
-                        GFP_NOFS);
-        if (!bhs)
-                return -ENOMEM;
-        ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket, bhs, 0);
-        if (ret)
-                goto out;
        /*
         * In order to make the operation more efficient and generic,
         * we copy all the blocks into a contiguous memory and do the
@@ -2865,26 +3530,16 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
        }
        buf = bucket_buf;
-        for (i = 0; i < blk_per_bucket; i++, buf += blocksize)
+        for (i = 0; i < bucket->bu_blocks; i++, buf += blocksize)
-                memcpy(buf, bhs[i]->b_data, blocksize);
+                memcpy(buf, bucket_block(bucket, i), blocksize);
-        handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), blk_per_bucket);
+        ret = ocfs2_xattr_bucket_journal_access(handle, bucket,
-        if (IS_ERR(handle)) {
+                                                OCFS2_JOURNAL_ACCESS_WRITE);
-                ret = PTR_ERR(handle);
+        if (ret < 0) {
-                handle = NULL;
                mlog_errno(ret);
                goto out;
        }
-        for (i = 0; i < blk_per_bucket; i++) {
-                ret = ocfs2_journal_access(handle, inode, bhs[i],
-                                           OCFS2_JOURNAL_ACCESS_WRITE);
-                if (ret < 0) {
-                        mlog_errno(ret);
-                        goto commit;
-                }
-        }
        xh = (struct ocfs2_xattr_header *)bucket_buf;
        entries = (char *)xh->xh_entries;
        xh_free_start = le16_to_cpu(xh->xh_free_start);
@@ -2940,7 +3595,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
                        "bucket %llu\n", (unsigned long long)blkno);
        if (xh_free_start == end)
-                goto commit;
+                goto out;
        memset(bucket_buf + xh_free_start, 0, end - xh_free_start);
        xh->xh_free_start = cpu_to_le16(end);
@@ -2951,169 +3606,94 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
             cmp_xe, swap_xe);
        buf = bucket_buf;
-        for (i = 0; i < blk_per_bucket; i++, buf += blocksize) {
+        for (i = 0; i < bucket->bu_blocks; i++, buf += blocksize)
-                memcpy(bhs[i]->b_data, buf, blocksize);
+                memcpy(bucket_block(bucket, i), buf, blocksize);
-                ocfs2_journal_dirty(handle, bhs[i]);
+        ocfs2_xattr_bucket_journal_dirty(handle, bucket);
-        }
-commit:
-        ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
 out:
-        if (bhs) {
-                for (i = 0; i < blk_per_bucket; i++)
-                        brelse(bhs[i]);
-        }
-        kfree(bhs);
        kfree(bucket_buf);
        return ret;
 }
 /*
- * Move half nums of the xattr bucket in the previous cluster to this new
+ * prev_blkno points to the start of an existing extent.  new_blkno
- * cluster. We only touch the last cluster of the previous extend record.
+ * points to a newly allocated extent.  Because we know each of our
+ * clusters contains more than one bucket, we can easily split one cluster
+ * at a bucket boundary.  So we take the last cluster of the existing
+ * extent and split it down the middle.  We move the last half of the
+ * buckets in the last cluster of the existing extent over to the new
+ * extent.
+ *
+ * first_bh is the buffer at prev_blkno so we can update the existing
+ * extent's bucket count.  header_bh is the bucket where we were hoping
+ * to insert our xattr.  If the bucket move places the target in the new
+ * extent, we'll update first_bh and header_bh after modifying the old
+ * extent.
 *
- * first_bh is the first buffer_head of a series of bucket in the same
+ * first_hash will be set as the 1st xe's name_hash in the new extent.
- * extent rec and header_bh is the header of one bucket in this cluster.
- * They will be updated if we move the data header_bh contains to the new
- * cluster. first_hash will be set as the 1st xe's name_hash of the new cluster.
 */
 static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode,
                                               handle_t *handle,
-                                               struct buffer_head **first_bh,
+                                               struct ocfs2_xattr_bucket *first,
-                                               struct buffer_head **header_bh,
+                                               struct ocfs2_xattr_bucket *target,
                                               u64 new_blkno,
-                                               u64 prev_blkno,
                                               u32 num_clusters,
                                               u32 *first_hash)
 {
-        int i, ret, credits;
+        int ret;
-        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        struct super_block *sb = inode->i_sb;
-        int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+        int blks_per_bucket = ocfs2_blocks_per_xattr_bucket(sb);
-        int num_buckets = ocfs2_xattr_buckets_per_cluster(osb);
+        int num_buckets = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(sb));
-        int blocksize = inode->i_sb->s_blocksize;
+        int to_move = num_buckets / 2;
-        struct buffer_head *old_bh, *new_bh, *prev_bh, *new_first_bh = NULL;
+        u64 src_blkno;
-        struct ocfs2_xattr_header *new_xh;
+        u64 last_cluster_blkno = bucket_blkno(first) +
-        struct ocfs2_xattr_header *xh =
+                ((num_clusters - 1) * ocfs2_clusters_to_blocks(sb, 1));
-                        (struct ocfs2_xattr_header *)((*first_bh)->b_data);
-        BUG_ON(le16_to_cpu(xh->xh_num_buckets) < num_buckets);
-        BUG_ON(OCFS2_XATTR_BUCKET_SIZE == osb->s_clustersize);
-        prev_bh = *first_bh;
-        get_bh(prev_bh);
-        xh = (struct ocfs2_xattr_header *)prev_bh->b_data;
-        prev_blkno += (num_clusters - 1) * bpc + bpc / 2;
+        BUG_ON(le16_to_cpu(bucket_xh(first)->xh_num_buckets) < num_buckets);
+        BUG_ON(OCFS2_XATTR_BUCKET_SIZE == OCFS2_SB(sb)->s_clustersize);
        mlog(0, "move half of xattrs in cluster %llu to %llu\n",
-             (unsigned long long)prev_blkno, (unsigned long long)new_blkno);
+             (unsigned long long)last_cluster_blkno, (unsigned long long)new_blkno);
-        /*
+        ret = ocfs2_mv_xattr_buckets(inode, handle, bucket_blkno(first),
-         * We need to update the 1st half of the new cluster and
+                                     last_cluster_blkno, new_blkno,
-         * 1 more for the update of the 1st bucket of the previous
+                                     to_move, first_hash);
-         * extent record.
-         */
-        credits = bpc / 2 + 1;
-        ret = ocfs2_extend_trans(handle, credits);
        if (ret) {
                mlog_errno(ret);
                goto out;
        }
-        ret = ocfs2_journal_access(handle, inode, prev_bh,
+        /* This is the first bucket that got moved */
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+        src_blkno = last_cluster_blkno + (to_move * blks_per_bucket);
-        if (ret) {
-                mlog_errno(ret);
-                goto out;
-        }
-        for (i = 0; i < bpc / 2; i++, prev_blkno++, new_blkno++) {
+        /*
-                old_bh = new_bh = NULL;
+         * If the target bucket was part of the moved buckets, we need to
-                new_bh = sb_getblk(inode->i_sb, new_blkno);
+         * update first and target.
-                if (!new_bh) {
+         */
-                        ret = -EIO;
+        if (bucket_blkno(target) >= src_blkno) {
-                        mlog_errno(ret);
+                /* Find the block for the new target bucket */
-                        goto out;
+                src_blkno = new_blkno +
-                }
+                        (bucket_blkno(target) - src_blkno);
-                ocfs2_set_new_buffer_uptodate(inode, new_bh);
+                ocfs2_xattr_bucket_relse(first);
+                ocfs2_xattr_bucket_relse(target);
-                ret = ocfs2_journal_access(handle, inode, new_bh,
+                /*
-                                           OCFS2_JOURNAL_ACCESS_CREATE);
+                 * These shouldn't fail - the buffers are in the
-                if (ret < 0) {
+                 * journal from ocfs2_cp_xattr_bucket().
+                 */
+                ret = ocfs2_read_xattr_bucket(first, new_blkno);
+                if (ret) {
                        mlog_errno(ret);
-                        brelse(new_bh);
                        goto out;
                }
+                ret = ocfs2_read_xattr_bucket(target, src_blkno);
-                ret = ocfs2_read_block(inode, prev_blkno, &old_bh);
+                if (ret)
-                if (ret < 0) {
                        mlog_errno(ret);
-                        brelse(new_bh);
-                        goto out;
-                }
-                memcpy(new_bh->b_data, old_bh->b_data, blocksize);
-                if (i == 0) {
-                        new_xh = (struct ocfs2_xattr_header *)new_bh->b_data;
-                        new_xh->xh_num_buckets = cpu_to_le16(num_buckets / 2);
-                        if (first_hash)
-                                *first_hash = le32_to_cpu(
-                                        new_xh->xh_entries[0].xe_name_hash);
-                        new_first_bh = new_bh;
-                        get_bh(new_first_bh);
-                }
-                ocfs2_journal_dirty(handle, new_bh);
-                if (*header_bh == old_bh) {
-                        brelse(*header_bh);
-                        *header_bh = new_bh;
-                        get_bh(*header_bh);
-                        brelse(*first_bh);
-                        *first_bh = new_first_bh;
-                        get_bh(*first_bh);
-                }
-                brelse(new_bh);
-                brelse(old_bh);
        }
-        le16_add_cpu(&xh->xh_num_buckets, -(num_buckets / 2));
-        ocfs2_journal_dirty(handle, prev_bh);
 out:
-        brelse(prev_bh);
-        brelse(new_first_bh);
-        return ret;
-}
-static int ocfs2_read_xattr_bucket(struct inode *inode,
-                                   u64 blkno,
-                                   struct buffer_head **bhs,
-                                   int new)
-{
-        int ret = 0;
-        u16 i, blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
-        if (!new)
-                return ocfs2_read_blocks(inode, blkno,
-                                         blk_per_bucket, bhs, 0);
-        for (i = 0; i < blk_per_bucket; i++) {
-                bhs[i] = sb_getblk(inode->i_sb, blkno + i);
-                if (bhs[i] == NULL) {
-                        ret = -EIO;
-                        mlog_errno(ret);
-                        break;
-                }
-                ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
-        }
        return ret;
 }
@@ -3178,8 +3758,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
 {
        int ret, i;
        int count, start, len, name_value_len = 0, xe_len, name_offset = 0;
-        u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+        struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL;
-        struct buffer_head **s_bhs, **t_bhs = NULL;
        struct ocfs2_xattr_header *xh;
        struct ocfs2_xattr_entry *xe;
        int blocksize = inode->i_sb->s_blocksize;
@@ -3187,47 +3766,52 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
        mlog(0, "move some of xattrs from bucket %llu to %llu\n",
             (unsigned long long)blk, (unsigned long long)new_blk);
-        s_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS);
+        s_bucket = ocfs2_xattr_bucket_new(inode);
-        if (!s_bhs)
+        t_bucket = ocfs2_xattr_bucket_new(inode);
-                return -ENOMEM;
+        if (!s_bucket || !t_bucket) {
+                ret = -ENOMEM;
-        ret = ocfs2_read_xattr_bucket(inode, blk, s_bhs, 0);
-        if (ret) {
                mlog_errno(ret);
                goto out;
        }
-        ret = ocfs2_journal_access(handle, inode, s_bhs[0],
+        ret = ocfs2_read_xattr_bucket(s_bucket, blk);
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
                goto out;
        }
-        t_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS);
+        ret = ocfs2_xattr_bucket_journal_access(handle, s_bucket,
-        if (!t_bhs) {
+                                                OCFS2_JOURNAL_ACCESS_WRITE);
-                ret = -ENOMEM;
+        if (ret) {
+                mlog_errno(ret);
                goto out;
        }
-        ret = ocfs2_read_xattr_bucket(inode, new_blk, t_bhs, new_bucket_head);
+        /*
+         * Even if !new_bucket_head, we're overwriting t_bucket.  Thus,
+         * there's no need to read it.
+         */
+        ret = ocfs2_init_xattr_bucket(t_bucket, new_blk);
        if (ret) {
                mlog_errno(ret);
                goto out;
        }
-        for (i = 0; i < blk_per_bucket; i++) {
+        /*
-                ret = ocfs2_journal_access(handle, inode, t_bhs[i],
+         * Hey, if we're overwriting t_bucket, what difference does
-                                           new_bucket_head ?
+         * ACCESS_CREATE vs ACCESS_WRITE make?  See the comment in the
-                                           OCFS2_JOURNAL_ACCESS_CREATE :
+         * same part of ocfs2_cp_xattr_bucket().
-                                           OCFS2_JOURNAL_ACCESS_WRITE);
+         */
-                if (ret) {
+        ret = ocfs2_xattr_bucket_journal_access(handle, t_bucket,
-                        mlog_errno(ret);
+                                                new_bucket_head ?
-                        goto out;
+                                                OCFS2_JOURNAL_ACCESS_CREATE :
-                }
+                                                OCFS2_JOURNAL_ACCESS_WRITE);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
        }
-        xh = (struct ocfs2_xattr_header *)s_bhs[0]->b_data;
+        xh = bucket_xh(s_bucket);
        count = le16_to_cpu(xh->xh_count);
        start = ocfs2_xattr_find_divide_pos(xh);
@@ -3239,10 +3823,10 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
                 * The hash value is set as one larger than
                 * that of the last entry in the previous bucket.
                 */
-                for (i = 0; i < blk_per_bucket; i++)
+                for (i = 0; i < t_bucket->bu_blocks; i++)
-                        memset(t_bhs[i]->b_data, 0, blocksize);
+                        memset(bucket_block(t_bucket, i), 0, blocksize);
-                xh = (struct ocfs2_xattr_header *)t_bhs[0]->b_data;
+                xh = bucket_xh(t_bucket);
                xh->xh_free_start = cpu_to_le16(blocksize);
                xh->xh_entries[0].xe_name_hash = xe->xe_name_hash;
                le32_add_cpu(&xh->xh_entries[0].xe_name_hash, 1);
@@ -3251,11 +3835,10 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
        }
        /* copy the whole bucket to the new first. */
-        for (i = 0; i < blk_per_bucket; i++)
+        ocfs2_xattr_bucket_copy_data(t_bucket, s_bucket);
-                memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize);
        /* update the new bucket. */
-        xh = (struct ocfs2_xattr_header *)t_bhs[0]->b_data;
+        xh = bucket_xh(t_bucket);
        /*
         * Calculate the total name/value len and xh_free_start for
@@ -3319,11 +3902,7 @@ set_num_buckets:
        else
                xh->xh_num_buckets = 0;
-        for (i = 0; i < blk_per_bucket; i++) {
+        ocfs2_xattr_bucket_journal_dirty(handle, t_bucket);
-                ocfs2_journal_dirty(handle, t_bhs[i]);
-                if (ret)
-                        mlog_errno(ret);
-        }
        /* store the first_hash of the new bucket. */
        if (first_hash)
@@ -3337,29 +3916,18 @@ set_num_buckets:
        if (start == count)
                goto out;
-        xh = (struct ocfs2_xattr_header *)s_bhs[0]->b_data;
+        xh = bucket_xh(s_bucket);
        memset(&xh->xh_entries[start], 0,
               sizeof(struct ocfs2_xattr_entry) * (count - start));
        xh->xh_count = cpu_to_le16(start);
        xh->xh_free_start = cpu_to_le16(name_offset);
        xh->xh_name_value_len = cpu_to_le16(name_value_len);
-        ocfs2_journal_dirty(handle, s_bhs[0]);
+        ocfs2_xattr_bucket_journal_dirty(handle, s_bucket);
-        if (ret)
-                mlog_errno(ret);
 out:
-        if (s_bhs) {
+        ocfs2_xattr_bucket_free(s_bucket);
-                for (i = 0; i < blk_per_bucket; i++)
+        ocfs2_xattr_bucket_free(t_bucket);
-                        brelse(s_bhs[i]);
-        }
-        kfree(s_bhs);
-        if (t_bhs) {
-                for (i = 0; i < blk_per_bucket; i++)
-                        brelse(t_bhs[i]);
-        }
-        kfree(t_bhs);
        return ret;
 }
@@ -3376,10 +3944,8 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,
                                 u64 t_blkno,
                                 int t_is_new)
 {
-        int ret, i;
+        int ret;
-        int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+        struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL;
-        int blocksize = inode->i_sb->s_blocksize;
-        struct buffer_head **s_bhs, **t_bhs = NULL;
        BUG_ON(s_blkno == t_blkno);
@@ -3387,92 +3953,115 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,
             (unsigned long long)s_blkno, (unsigned long long)t_blkno,
             t_is_new);
-        s_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket,
+        s_bucket = ocfs2_xattr_bucket_new(inode);
-                        GFP_NOFS);
+        t_bucket = ocfs2_xattr_bucket_new(inode);
-        if (!s_bhs)
+        if (!s_bucket || !t_bucket) {
-                return -ENOMEM;
+                ret = -ENOMEM;
+                mlog_errno(ret);
+                goto out;
+        }
-        ret = ocfs2_read_xattr_bucket(inode, s_blkno, s_bhs, 0);
+        ret = ocfs2_read_xattr_bucket(s_bucket, s_blkno);
        if (ret)
                goto out;
-        t_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket,
+        /*
-                        GFP_NOFS);
+         * Even if !t_is_new, we're overwriting t_bucket.  Thus,
-        if (!t_bhs) {
+         * there's no need to read it.
-                ret = -ENOMEM;
+         */
+        ret = ocfs2_init_xattr_bucket(t_bucket, t_blkno);
+        if (ret)
                goto out;
-        }
-        ret = ocfs2_read_xattr_bucket(inode, t_blkno, t_bhs, t_is_new);
+        /*
+         * Hey, if we're overwriting t_bucket, what difference does
+         * ACCESS_CREATE vs ACCESS_WRITE make?  Well, if we allocated a new
+         * cluster to fill, we came here from
+         * ocfs2_mv_xattr_buckets(), and it is really new -
+         * ACCESS_CREATE is required.  But we also might have moved data
+         * out of t_bucket before extending back into it.
+         * ocfs2_add_new_xattr_bucket() can do this - its call to
+         * ocfs2_add_new_xattr_cluster() may have created a new extent
+         * and copied out the end of the old extent.  Then it re-extends
+         * the old extent back to create space for new xattrs.  That's
+         * how we get here, and the bucket isn't really new.
+         */
+        ret = ocfs2_xattr_bucket_journal_access(handle, t_bucket,
+                                                t_is_new ?
+                                                OCFS2_JOURNAL_ACCESS_CREATE :
+                                                OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret)
                goto out;
-        for (i = 0; i < blk_per_bucket; i++) {
+        ocfs2_xattr_bucket_copy_data(t_bucket, s_bucket);
-                ret = ocfs2_journal_access(handle, inode, t_bhs[i],
+        ocfs2_xattr_bucket_journal_dirty(handle, t_bucket);
-                                           t_is_new ?
-                                           OCFS2_JOURNAL_ACCESS_CREATE :
-                                           OCFS2_JOURNAL_ACCESS_WRITE);
-                if (ret)
-                        goto out;
-        }
-        for (i = 0; i < blk_per_bucket; i++) {
-                memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize);
-                ocfs2_journal_dirty(handle, t_bhs[i]);
-        }
 out:
-        if (s_bhs) {
+        ocfs2_xattr_bucket_free(t_bucket);
-                for (i = 0; i < blk_per_bucket; i++)
+        ocfs2_xattr_bucket_free(s_bucket);
-                        brelse(s_bhs[i]);
-        }
-        kfree(s_bhs);
-        if (t_bhs) {
-                for (i = 0; i < blk_per_bucket; i++)
-                        brelse(t_bhs[i]);
-        }
-        kfree(t_bhs);
        return ret;
 }
 /*
- * Copy one xattr cluster from src_blk to to_blk.
+ * src_blk points to the start of an existing extent.  last_blk points to
- * The to_blk will become the first bucket header of the cluster, so its
+ * the last cluster in that extent.  to_blk points to a newly allocated
- * xh_num_buckets will be initialized as the bucket num in the cluster.
+ * extent.  We copy the buckets from the cluster at last_blk to the new
+ * extent.  If start_bucket is non-zero, we skip that many buckets before
+ * we start copying.  The new extent's xh_num_buckets gets set to the
+ * number of buckets we copied.  The old extent's xh_num_buckets shrinks
+ * by the same amount.
 */
-static int ocfs2_cp_xattr_cluster(struct inode *inode,
+static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle,
-                                  handle_t *handle,
+                                  u64 src_blk, u64 last_blk, u64 to_blk,
-                                  struct buffer_head *first_bh,
+                                  unsigned int start_bucket,
-                                  u64 src_blk,
-                                  u64 to_blk,
                                  u32 *first_hash)
 {
        int i, ret, credits;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-        int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+        int blks_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
        int num_buckets = ocfs2_xattr_buckets_per_cluster(osb);
-        struct buffer_head *bh = NULL;
+        struct ocfs2_xattr_bucket *old_first, *new_first;
-        struct ocfs2_xattr_header *xh;
-        u64 to_blk_start = to_blk;
+        mlog(0, "mv xattrs from cluster %llu to %llu\n",
+             (unsigned long long)last_blk, (unsigned long long)to_blk);
+        BUG_ON(start_bucket >= num_buckets);
+        if (start_bucket) {
+                num_buckets -= start_bucket;
+                last_blk += (start_bucket * blks_per_bucket);
+        }
+        /* The first bucket of the original extent */
+        old_first = ocfs2_xattr_bucket_new(inode);
+        /* The first bucket of the new extent */
+        new_first = ocfs2_xattr_bucket_new(inode);
+        if (!old_first || !new_first) {
+                ret = -ENOMEM;
+                mlog_errno(ret);
+                goto out;
+        }
-        mlog(0, "cp xattrs from cluster %llu to %llu\n",
+        ret = ocfs2_read_xattr_bucket(old_first, src_blk);
-             (unsigned long long)src_blk, (unsigned long long)to_blk);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
        /*
-         * We need to update the new cluster and 1 more for the update of
+         * We need to update the first bucket of the old extent and all
-         * the 1st bucket of the previous extent rec.
+         * the buckets going to the new extent.
         */
-        credits = bpc + 1;
+        credits = ((num_buckets + 1) * blks_per_bucket) +
+                handle->h_buffer_credits;
        ret = ocfs2_extend_trans(handle, credits);
        if (ret) {
                mlog_errno(ret);
                goto out;
        }
-        ret = ocfs2_journal_access(handle, inode, first_bh,
+        ret = ocfs2_xattr_bucket_journal_access(handle, old_first,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                                                OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
                goto out;
@@ -3480,45 +4069,45 @@ static int ocfs2_cp_xattr_cluster(struct inode *inode,
        for (i = 0; i < num_buckets; i++) {
                ret = ocfs2_cp_xattr_bucket(inode, handle,
-                                            src_blk, to_blk, 1);
+                                            last_blk + (i * blks_per_bucket),
+                                            to_blk + (i * blks_per_bucket),
+                                            1);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
                }
-                src_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
-                to_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
        }
-        /* update the old bucket header. */
+        /*
-        xh = (struct ocfs2_xattr_header *)first_bh->b_data;
+         * Get the new bucket ready before we dirty anything
-        le16_add_cpu(&xh->xh_num_buckets, -num_buckets);
+         * (This actually shouldn't fail, because we already dirtied
+         * it once in ocfs2_cp_xattr_bucket()).
-        ocfs2_journal_dirty(handle, first_bh);
+         */
+        ret = ocfs2_read_xattr_bucket(new_first, to_blk);
-        /* update the new bucket header. */
+        if (ret) {
-        ret = ocfs2_read_block(inode, to_blk_start, &bh);
-        if (ret < 0) {
                mlog_errno(ret);
                goto out;
        }
+        ret = ocfs2_xattr_bucket_journal_access(handle, new_first,
-        ret = ocfs2_journal_access(handle, inode, bh,
+                                                OCFS2_JOURNAL_ACCESS_WRITE);
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
                goto out;
        }
-        xh = (struct ocfs2_xattr_header *)bh->b_data;
+        /* Now update the headers */
-        xh->xh_num_buckets = cpu_to_le16(num_buckets);
+        le16_add_cpu(&bucket_xh(old_first)->xh_num_buckets, -num_buckets);
+        ocfs2_xattr_bucket_journal_dirty(handle, old_first);
-        ocfs2_journal_dirty(handle, bh);
+        bucket_xh(new_first)->xh_num_buckets = cpu_to_le16(num_buckets);
+        ocfs2_xattr_bucket_journal_dirty(handle, new_first);
        if (first_hash)
-                *first_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash);
+                *first_hash = le32_to_cpu(bucket_xh(new_first)->xh_entries[0].xe_name_hash);
 out:
-        brelse(bh);
+        ocfs2_xattr_bucket_free(new_first);
+        ocfs2_xattr_bucket_free(old_first);
        return ret;
 }
@@ -3534,7 +4123,7 @@ static int ocfs2_divide_xattr_cluster(struct inode *inode,
                                      u32 *first_hash)
 {
        u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
-        int ret, credits = 2 * blk_per_bucket;
+        int ret, credits = 2 * blk_per_bucket + handle->h_buffer_credits;
        BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize);
@@ -3577,43 +4166,49 @@ static int ocfs2_divide_xattr_cluster(struct inode *inode,
 */
 static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode,
                                            handle_t *handle,
-                                            struct buffer_head **first_bh,
+                                            struct ocfs2_xattr_bucket *first,
-                                            struct buffer_head **header_bh,
+                                            struct ocfs2_xattr_bucket *target,
                                            u64 new_blk,
-                                            u64 prev_blk,
                                            u32 prev_clusters,
                                            u32 *v_start,
                                            int *extend)
 {
-        int ret = 0;
+        int ret;
-        int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
        mlog(0, "adjust xattrs from cluster %llu len %u to %llu\n",
-             (unsigned long long)prev_blk, prev_clusters,
+             (unsigned long long)bucket_blkno(first), prev_clusters,
             (unsigned long long)new_blk);
-        if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1)
+        if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1) {
                ret = ocfs2_mv_xattr_bucket_cross_cluster(inode,
                                                          handle,
-                                                          first_bh,
+                                                          first, target,
-                                                          header_bh,
                                                          new_blk,
-                                                          prev_blk,
                                                          prev_clusters,
                                                          v_start);
-        else {
+                if (ret)
-                u64 last_blk = prev_blk + bpc * (prev_clusters - 1);
+                        mlog_errno(ret);
+        } else {
-                if (prev_clusters > 1 && (*header_bh)->b_blocknr != last_blk)
+                /* The start of the last cluster in the first extent */
-                        ret = ocfs2_cp_xattr_cluster(inode, handle, *first_bh,
+                u64 last_blk = bucket_blkno(first) +
-                                                     last_blk, new_blk,
+                        ((prev_clusters - 1) *
+                         ocfs2_clusters_to_blocks(inode->i_sb, 1));
+                if (prev_clusters > 1 && bucket_blkno(target) != last_blk) {
+                        ret = ocfs2_mv_xattr_buckets(inode, handle,
+                                                     bucket_blkno(first),
+                                                     last_blk, new_blk, 0,
                                                     v_start);
-                else {
+                        if (ret)
+                                mlog_errno(ret);
+                } else {
                        ret = ocfs2_divide_xattr_cluster(inode, handle,
                                                         last_blk, new_blk,
                                                         v_start);
+                        if (ret)
+                                mlog_errno(ret);
-                        if ((*header_bh)->b_blocknr == last_blk && extend)
+                        if ((bucket_blkno(target) == last_blk) && extend)
                                *extend = 0;
                }
        }
@@ -3639,56 +4234,37 @@ static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode,
 */
 static int ocfs2_add_new_xattr_cluster(struct inode *inode,
                                       struct buffer_head *root_bh,
-                                       struct buffer_head **first_bh,
+                                       struct ocfs2_xattr_bucket *first,
-                                       struct buffer_head **header_bh,
+                                       struct ocfs2_xattr_bucket *target,
                                       u32 *num_clusters,
                                       u32 prev_cpos,
-                                       u64 prev_blkno,
+                                       int *extend,
-                                       int *extend)
+                                       struct ocfs2_xattr_set_ctxt *ctxt)
 {
-        int ret, credits;
+        int ret;
        u16 bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
        u32 prev_clusters = *num_clusters;
        u32 clusters_to_add = 1, bit_off, num_bits, v_start = 0;
        u64 block;
-        handle_t *handle = NULL;
+        handle_t *handle = ctxt->handle;
-        struct ocfs2_alloc_context *data_ac = NULL;
-        struct ocfs2_alloc_context *meta_ac = NULL;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        struct ocfs2_extent_tree et;
        mlog(0, "Add new xattr cluster for %llu, previous xattr hash = %u, "
             "previous xattr blkno = %llu\n",
             (unsigned long long)OCFS2_I(inode)->ip_blkno,
-             prev_cpos, (unsigned long long)prev_blkno);
+             prev_cpos, (unsigned long long)bucket_blkno(first));
        ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh);
-        ret = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
+        ret = ocfs2_journal_access_xb(handle, inode, root_bh,
-                                    &data_ac, &meta_ac);
+                                      OCFS2_JOURNAL_ACCESS_WRITE);
-        if (ret) {
-                mlog_errno(ret);
-                goto leave;
-        }
-        credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el,
-                                            clusters_to_add);
-        handle = ocfs2_start_trans(osb, credits);
-        if (IS_ERR(handle)) {
-                ret = PTR_ERR(handle);
-                handle = NULL;
-                mlog_errno(ret);
-                goto leave;
-        }
-        ret = ocfs2_journal_access(handle, inode, root_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret < 0) {
                mlog_errno(ret);
                goto leave;
        }
-        ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
+        ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac, 1,
                                     clusters_to_add, &bit_off, &num_bits);
        if (ret < 0) {
                if (ret != -ENOSPC)
@@ -3702,7 +4278,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
        mlog(0, "Allocating %u clusters at block %u for xattr in inode %llu\n",
             num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
-        if (prev_blkno + prev_clusters * bpc == block &&
+        if (bucket_blkno(first) + (prev_clusters * bpc) == block &&
            (prev_clusters + num_bits) << osb->s_clustersize_bits <=
             OCFS2_MAX_XATTR_TREE_LEAF_SIZE) {
                /*
@@ -3721,10 +4297,9 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
        } else {
                ret = ocfs2_adjust_xattr_cross_cluster(inode,
                                                       handle,
-                                                       first_bh,
+                                                       first,
-                                                       header_bh,
+                                                       target,
                                                       block,
-                                                       prev_blkno,
                                                       prev_clusters,
                                                       &v_start,
                                                       extend);
@@ -3734,149 +4309,137 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
                }
        }
-        if (handle->h_buffer_credits < credits) {
-                /*
-                 * The journal has been restarted before, and don't
-                 * have enough space for the insertion, so extend it
-                 * here.
-                 */
-                ret = ocfs2_extend_trans(handle, credits);
-                if (ret) {
-                        mlog_errno(ret);
-                        goto leave;
-                }
-        }
        mlog(0, "Insert %u clusters at block %llu for xattr at %u\n",
             num_bits, (unsigned long long)block, v_start);
        ret = ocfs2_insert_extent(osb, handle, inode, &et, v_start, block,
-                                  num_bits, 0, meta_ac);
+                                  num_bits, 0, ctxt->meta_ac);
        if (ret < 0) {
                mlog_errno(ret);
                goto leave;
        }
        ret = ocfs2_journal_dirty(handle, root_bh);
-        if (ret < 0) {
+        if (ret < 0)
                mlog_errno(ret);
-                goto leave;
-        }
 leave:
-        if (handle)
-                ocfs2_commit_trans(osb, handle);
-        if (data_ac)
-                ocfs2_free_alloc_context(data_ac);
-        if (meta_ac)
-                ocfs2_free_alloc_context(meta_ac);
        return ret;
 }
 /*
- * Extend a new xattr bucket and move xattrs to the end one by one until
+ * We are given an extent.  'first' is the bucket at the very front of
- * We meet with start_bh. Only move half of the xattrs to the bucket after it.
+ * the extent.  The extent has space for an additional bucket past
+ * bucket_xh(first)->xh_num_buckets.  'target_blk' is the block number
+ * of the target bucket.  We wish to shift every bucket past the target
+ * down one, filling in that additional space.  When we get back to the
+ * target, we split the target between itself and the now-empty bucket
+ * at target+1 (aka, target_blk + blk_per_bucket).
 */
 static int ocfs2_extend_xattr_bucket(struct inode *inode,
-                                     struct buffer_head *first_bh,
+                                     handle_t *handle,
-                                     struct buffer_head *start_bh,
+                                     struct ocfs2_xattr_bucket *first,
+                                     u64 target_blk,
                                     u32 num_clusters)
 {
        int ret, credits;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
-        u64 start_blk = start_bh->b_blocknr, end_blk;
+        u64 end_blk;
-        u32 num_buckets = num_clusters * ocfs2_xattr_buckets_per_cluster(osb);
+        u16 new_bucket = le16_to_cpu(bucket_xh(first)->xh_num_buckets);
-        handle_t *handle;
-        struct ocfs2_xattr_header *first_xh =
-                                (struct ocfs2_xattr_header *)first_bh->b_data;
-        u16 bucket = le16_to_cpu(first_xh->xh_num_buckets);
        mlog(0, "extend xattr bucket in %llu, xattr extend rec starting "
-             "from %llu, len = %u\n", (unsigned long long)start_blk,
+             "from %llu, len = %u\n", (unsigned long long)target_blk,
-             (unsigned long long)first_bh->b_blocknr, num_clusters);
+             (unsigned long long)bucket_blkno(first), num_clusters);
-        BUG_ON(bucket >= num_buckets);
+        /* The extent must have room for an additional bucket */
+        BUG_ON(new_bucket >=
+               (num_clusters * ocfs2_xattr_buckets_per_cluster(osb)));
-        end_blk = first_bh->b_blocknr + (bucket - 1) * blk_per_bucket;
+        /* end_blk points to the last existing bucket */
+        end_blk = bucket_blkno(first) + ((new_bucket - 1) * blk_per_bucket);
        /*
-         * We will touch all the buckets after the start_bh(include it).
+         * end_blk is the start of the last existing bucket.
-         * Add one more bucket and modify the first_bh.
+         * Thus, (end_blk - target_blk) covers the target bucket and
+         * every bucket after it up to, but not including, the last
+         * existing bucket.  Then we add the last existing bucket, the
+         * new bucket, and the first bucket (3 * blk_per_bucket).
         */
-        credits = end_blk - start_blk + 2 * blk_per_bucket + 1;
+        credits = (end_blk - target_blk) + (3 * blk_per_bucket) +
-        handle = ocfs2_start_trans(osb, credits);
+                  handle->h_buffer_credits;
-        if (IS_ERR(handle)) {
+        ret = ocfs2_extend_trans(handle, credits);
-                ret = PTR_ERR(handle);
+        if (ret) {
-                handle = NULL;
                mlog_errno(ret);
                goto out;
        }
-        ret = ocfs2_journal_access(handle, inode, first_bh,
+        ret = ocfs2_xattr_bucket_journal_access(handle, first,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                                                OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
-                goto commit;
+                goto out;
        }
-        while (end_blk != start_blk) {
+        while (end_blk != target_blk) {
                ret = ocfs2_cp_xattr_bucket(inode, handle, end_blk,
                                            end_blk + blk_per_bucket, 0);
                if (ret)
-                        goto commit;
+                        goto out;
                end_blk -= blk_per_bucket;
        }
-        /* Move half of the xattr in start_blk to the next bucket. */
+        /* Move half of the xattr in target_blk to the next bucket. */
-        ret = ocfs2_divide_xattr_bucket(inode, handle, start_blk,
+        ret = ocfs2_divide_xattr_bucket(inode, handle, target_blk,
-                                        start_blk + blk_per_bucket, NULL, 0);
+                                        target_blk + blk_per_bucket, NULL, 0);
-        le16_add_cpu(&first_xh->xh_num_buckets, 1);
+        le16_add_cpu(&bucket_xh(first)->xh_num_buckets, 1);
-        ocfs2_journal_dirty(handle, first_bh);
+        ocfs2_xattr_bucket_journal_dirty(handle, first);
-commit:
-        ocfs2_commit_trans(osb, handle);
 out:
        return ret;
 }
 /*
- * Add new xattr bucket in an extent record and adjust the buckets accordingly.
+ * Add new xattr bucket in an extent record and adjust the buckets
- * xb_bh is the ocfs2_xattr_block.
+ * accordingly.  xb_bh is the ocfs2_xattr_block, and target is the
- * We will move all the buckets starting from header_bh to the next place. As
+ * bucket we want to insert into.
- * for this one, half num of its xattrs will be moved to the next one.
+ *
+ * In the easy case, we will move all the buckets after target down by
+ * one. Half of target's xattrs will be moved to the next bucket.
 *
- * We will allocate a new cluster if current cluster is full and adjust
+ * If current cluster is full, we'll allocate a new one.  This may not
- * header_bh and first_bh if the insert place is moved to the new cluster.
+ * be contiguous.  The underlying calls will make sure that there is
+ * space for the insert, shifting buckets around if necessary.
+ * 'target' may be moved by those calls.
 */
 static int ocfs2_add_new_xattr_bucket(struct inode *inode,
                                      struct buffer_head *xb_bh,
-                                      struct buffer_head *header_bh)
+                                      struct ocfs2_xattr_bucket *target,
+                                      struct ocfs2_xattr_set_ctxt *ctxt)
 {
-        struct ocfs2_xattr_header *first_xh = NULL;
-        struct buffer_head *first_bh = NULL;
        struct ocfs2_xattr_block *xb =
                        (struct ocfs2_xattr_block *)xb_bh->b_data;
        struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root;
        struct ocfs2_extent_list *el = &xb_root->xt_list;
-        struct ocfs2_xattr_header *xh =
+        u32 name_hash =
-                        (struct ocfs2_xattr_header *)header_bh->b_data;
+                le32_to_cpu(bucket_xh(target)->xh_entries[0].xe_name_hash);
-        u32 name_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash);
+        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-        struct super_block *sb = inode->i_sb;
-        struct ocfs2_super *osb = OCFS2_SB(sb);
        int ret, num_buckets, extend = 1;
        u64 p_blkno;
        u32 e_cpos, num_clusters;
+        /* The bucket at the front of the extent */
+        struct ocfs2_xattr_bucket *first;
-        mlog(0, "Add new xattr bucket starting form %llu\n",
+        mlog(0, "Add new xattr bucket starting from %llu\n",
-             (unsigned long long)header_bh->b_blocknr);
+             (unsigned long long)bucket_blkno(target));
-        /*
+        /* The first bucket of the original extent */
-         * Add refrence for header_bh here because it may be
+        first = ocfs2_xattr_bucket_new(inode);
-         * changed in ocfs2_add_new_xattr_cluster and we need
+        if (!first) {
-         * to free it in the end.
+                ret = -ENOMEM;
-         */
+                mlog_errno(ret);
-        get_bh(header_bh);
+                goto out;
+        }
        ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &e_cpos,
                                  &num_clusters, el);
@@ -3885,40 +4448,45 @@ static int ocfs2_add_new_xattr_bucket(struct inode *inode,
                goto out;
        }
-        ret = ocfs2_read_block(inode, p_blkno, &first_bh);
+        ret = ocfs2_read_xattr_bucket(first, p_blkno);
        if (ret) {
                mlog_errno(ret);
                goto out;
        }
        num_buckets = ocfs2_xattr_buckets_per_cluster(osb) * num_clusters;
-        first_xh = (struct ocfs2_xattr_header *)first_bh->b_data;
+        if (num_buckets == le16_to_cpu(bucket_xh(first)->xh_num_buckets)) {
+                /*
-        if (num_buckets == le16_to_cpu(first_xh->xh_num_buckets)) {
+                 * This can move first+target if the target bucket moves
+                 * to the new extent.
+                 */
                ret = ocfs2_add_new_xattr_cluster(inode,
                                                  xb_bh,
-                                                  &first_bh,
+                                                  first,
-                                                  &header_bh,
+                                                  target,
                                                  &num_clusters,
                                                  e_cpos,
-                                                  p_blkno,
+                                                  &extend,
-                                                  &extend);
+                                                  ctxt);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
                }
        }
-        if (extend)
+        if (extend) {
                ret = ocfs2_extend_xattr_bucket(inode,
-                                                first_bh,
+                                                ctxt->handle,
-                                                header_bh,
+                                                first,
+                                                bucket_blkno(target),
                                                num_clusters);
-        if (ret)
+                if (ret)
-                mlog_errno(ret);
+                        mlog_errno(ret);
+        }
 out:
-        brelse(first_bh);
+        ocfs2_xattr_bucket_free(first);
-        brelse(header_bh);
        return ret;
 }
@@ -3929,7 +4497,7 @@ static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode,
        int block_off = offs >> inode->i_sb->s_blocksize_bits;
        offs = offs % inode->i_sb->s_blocksize;
-        return bucket->bhs[block_off]->b_data + offs;
+        return bucket_block(bucket, block_off) + offs;
 }
 /*
@@ -3984,7 +4552,7 @@ static void ocfs2_xattr_set_entry_normal(struct inode *inode,
                                xe->xe_value_size = 0;
                        val = ocfs2_xattr_bucket_get_val(inode,
-                                                         &xs->bucket, offs);
+                                                         xs->bucket, offs);
                        memset(val + OCFS2_XATTR_SIZE(name_len), 0,
                               size - OCFS2_XATTR_SIZE(name_len));
                        if (OCFS2_XATTR_SIZE(xi->value_len) > 0)
@@ -4062,8 +4630,7 @@ set_new_name_value:
                xh->xh_free_start = cpu_to_le16(offs);
        }
-        val = ocfs2_xattr_bucket_get_val(inode,
+        val = ocfs2_xattr_bucket_get_val(inode, xs->bucket, offs - size);
-                                         &xs->bucket, offs - size);
        xe->xe_name_offset = cpu_to_le16(offs - size);
        memset(val, 0, size);
@@ -4079,125 +4646,45 @@ set_new_name_value:
        return;
 }
-static int ocfs2_xattr_bucket_handle_journal(struct inode *inode,
-                                             handle_t *handle,
-                                             struct ocfs2_xattr_search *xs,
-                                             struct buffer_head **bhs,
-                                             u16 bh_num)
-{
-        int ret = 0, off, block_off;
-        struct ocfs2_xattr_entry *xe = xs->here;
-        /*
-         * First calculate all the blocks we should journal_access
-         * and journal_dirty. The first block should always be touched.
-         */
-        ret = ocfs2_journal_dirty(handle, bhs[0]);
-        if (ret)
-                mlog_errno(ret);
-        /* calc the data. */
-        off = le16_to_cpu(xe->xe_name_offset);
-        block_off = off >> inode->i_sb->s_blocksize_bits;
-        ret = ocfs2_journal_dirty(handle, bhs[block_off]);
-        if (ret)
-                mlog_errno(ret);
-        return ret;
-}
 /*
 * Set the xattr entry in the specified bucket.
 * The bucket is indicated by xs->bucket and it should have the enough
 * space for the xattr insertion.
 */
 static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
+                                           handle_t *handle,
                                           struct ocfs2_xattr_info *xi,
                                           struct ocfs2_xattr_search *xs,
                                           u32 name_hash,
                                           int local)
 {
-        int i, ret;
+        int ret;
-        handle_t *handle = NULL;
+        u64 blkno;
-        u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
-        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        mlog(0, "Set xattr entry len = %lu index = %d in bucket %llu\n",
             (unsigned long)xi->value_len, xi->name_index,
-             (unsigned long long)xs->bucket.bhs[0]->b_blocknr);
+             (unsigned long long)bucket_blkno(xs->bucket));
-        if (!xs->bucket.bhs[1]) {
+        if (!xs->bucket->bu_bhs[1]) {
-                ret = ocfs2_read_blocks(inode,
+                blkno = bucket_blkno(xs->bucket);
-                                        xs->bucket.bhs[0]->b_blocknr + 1,
+                ocfs2_xattr_bucket_relse(xs->bucket);
-                                        blk_per_bucket - 1, &xs->bucket.bhs[1],
+                ret = ocfs2_read_xattr_bucket(xs->bucket, blkno);
-                                        0);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
                }
        }
-        handle = ocfs2_start_trans(osb, blk_per_bucket);
+        ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
-        if (IS_ERR(handle)) {
+                                                OCFS2_JOURNAL_ACCESS_WRITE);
-                ret = PTR_ERR(handle);
+        if (ret < 0) {
-                handle = NULL;
                mlog_errno(ret);
                goto out;
        }
-        for (i = 0; i < blk_per_bucket; i++) {
-                ret = ocfs2_journal_access(handle, inode, xs->bucket.bhs[i],
-                                           OCFS2_JOURNAL_ACCESS_WRITE);
-                if (ret < 0) {
-                        mlog_errno(ret);
-                        goto out;
-                }
-        }
        ocfs2_xattr_set_entry_normal(inode, xi, xs, name_hash, local);
+        ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
-        /*Only dirty the blocks we have touched in set xattr. */
-        ret = ocfs2_xattr_bucket_handle_journal(inode, handle, xs,
-                                                xs->bucket.bhs, blk_per_bucket);
-        if (ret)
-                mlog_errno(ret);
-out:
-        ocfs2_commit_trans(osb, handle);
-        return ret;
-}
-static int ocfs2_xattr_value_update_size(struct inode *inode,
-                                         struct buffer_head *xe_bh,
-                                         struct ocfs2_xattr_entry *xe,
-                                         u64 new_size)
-{
-        int ret;
-        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-        handle_t *handle = NULL;
-        handle = ocfs2_start_trans(osb, 1);
-        if (IS_ERR(handle)) {
-                ret = -ENOMEM;
-                mlog_errno(ret);
-                goto out;
-        }
-        ret = ocfs2_journal_access(handle, inode, xe_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
-        if (ret < 0) {
-                mlog_errno(ret);
-                goto out_commit;
-        }
-        xe->xe_value_size = cpu_to_le64(new_size);
-        ret = ocfs2_journal_dirty(handle, xe_bh);
-        if (ret < 0)
-                mlog_errno(ret);
-out_commit:
-        ocfs2_commit_trans(osb, handle);
 out:
        return ret;
 }
@@ -4210,18 +4697,19 @@ out:
 * Copy the new updated xe and xe_value_root to new_xe and new_xv if needed.
 */
 static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
-                                             struct buffer_head *header_bh,
+                                             struct ocfs2_xattr_bucket *bucket,
                                             int xe_off,
-                                             int len)
+                                             int len,
+                                             struct ocfs2_xattr_set_ctxt *ctxt)
 {
        int ret, offset;
        u64 value_blk;
-        struct buffer_head *value_bh = NULL;
-        struct ocfs2_xattr_value_root *xv;
        struct ocfs2_xattr_entry *xe;
-        struct ocfs2_xattr_header *xh =
+        struct ocfs2_xattr_header *xh = bucket_xh(bucket);
-                        (struct ocfs2_xattr_header *)header_bh->b_data;
        size_t blocksize = inode->i_sb->s_blocksize;
+        struct ocfs2_xattr_value_buf vb = {
+                .vb_access = ocfs2_journal_access,
+        };
        xe = &xh->xh_entries[xe_off];
@@ -4234,49 +4722,57 @@ static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
        /* We don't allow ocfs2_xattr_value to be stored in different block. */
        BUG_ON(value_blk != (offset + OCFS2_XATTR_ROOT_SIZE - 1) / blocksize);
-        value_blk += header_bh->b_blocknr;
-        ret = ocfs2_read_block(inode, value_blk, &value_bh);
+        vb.vb_bh = bucket->bu_bhs[value_blk];
-        if (ret) {
+        BUG_ON(!vb.vb_bh);
-                mlog_errno(ret);
-                goto out;
-        }
-        xv = (struct ocfs2_xattr_value_root *)
+        vb.vb_xv = (struct ocfs2_xattr_value_root *)
-                (value_bh->b_data + offset % blocksize);
+                (vb.vb_bh->b_data + offset % blocksize);
+        /*
+         * From here on out we have to dirty the bucket.  The generic
+         * value calls only modify one of the bucket's bhs, but we need
+         * to send the bucket at once.  So if they error, they *could* have
+         * modified something.  We have to assume they did, and dirty
+         * the whole bucket.  This leaves us in a consistent state.
+         */
        mlog(0, "truncate %u in xattr bucket %llu to %d bytes.\n",
-             xe_off, (unsigned long long)header_bh->b_blocknr, len);
+             xe_off, (unsigned long long)bucket_blkno(bucket), len);
-        ret = ocfs2_xattr_value_truncate(inode, value_bh, xv, len);
+        ret = ocfs2_xattr_value_truncate(inode, &vb, len, ctxt);
        if (ret) {
                mlog_errno(ret);
                goto out;
        }
-        ret = ocfs2_xattr_value_update_size(inode, header_bh, xe, len);
+        ret = ocfs2_xattr_bucket_journal_access(ctxt->handle, bucket,
+                                                OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
                goto out;
        }
+        xe->xe_value_size = cpu_to_le64(len);
+        ocfs2_xattr_bucket_journal_dirty(ctxt->handle, bucket);
 out:
-        brelse(value_bh);
        return ret;
 }
 static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode,
-                                                struct ocfs2_xattr_search *xs,
+                                        struct ocfs2_xattr_search *xs,
-                                                int len)
+                                        int len,
+                                        struct ocfs2_xattr_set_ctxt *ctxt)
 {
        int ret, offset;
        struct ocfs2_xattr_entry *xe = xs->here;
        struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)xs->base;
-        BUG_ON(!xs->bucket.bhs[0] || !xe || ocfs2_xattr_is_local(xe));
+        BUG_ON(!xs->bucket->bu_bhs[0] || !xe || ocfs2_xattr_is_local(xe));
        offset = xe - xh->xh_entries;
-        ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket.bhs[0],
+        ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket,
-                                                offset, len);
+                                                offset, len, ctxt);
        if (ret)
                mlog_errno(ret);
@@ -4284,6 +4780,7 @@ static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode,
 }
 static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
+                                                handle_t *handle,
                                                struct ocfs2_xattr_search *xs,
                                                char *val,
                                                int value_len)
@@ -4299,7 +4796,8 @@ static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
        xv = (struct ocfs2_xattr_value_root *)(xs->base + offset);
-        return __ocfs2_xattr_set_value_outside(inode, xv, val, value_len);
+        return __ocfs2_xattr_set_value_outside(inode, handle,
+                                               xv, val, value_len);
 }
 static int ocfs2_rm_xattr_cluster(struct inode *inode,
@@ -4343,15 +4841,15 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
                }
        }
-        handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
+        handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb));
        if (IS_ERR(handle)) {
                ret = -ENOMEM;
                mlog_errno(ret);
                goto out;
        }
-        ret = ocfs2_journal_access(handle, inode, root_bh,
+        ret = ocfs2_journal_access_xb(handle, inode, root_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                                      OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
                goto out_commit;
@@ -4392,26 +4890,19 @@ out:
 }
 static void ocfs2_xattr_bucket_remove_xs(struct inode *inode,
+                                         handle_t *handle,
                                         struct ocfs2_xattr_search *xs)
 {
-        handle_t *handle = NULL;
+        struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket);
-        struct ocfs2_xattr_header *xh = xs->bucket.xh;
        struct ocfs2_xattr_entry *last = &xh->xh_entries[
                                                le16_to_cpu(xh->xh_count) - 1];
        int ret = 0;
-        handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), 1);
+        ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
-        if (IS_ERR(handle)) {
+                                                OCFS2_JOURNAL_ACCESS_WRITE);
-                ret = PTR_ERR(handle);
-                mlog_errno(ret);
-                return;
-        }
-        ret = ocfs2_journal_access(handle, inode, xs->bucket.bhs[0],
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
-                goto out_commit;
+                return;
        }
        /* Remove the old entry. */
@@ -4420,11 +4911,7 @@ static void ocfs2_xattr_bucket_remove_xs(struct inode *inode,
        memset(last, 0, sizeof(struct ocfs2_xattr_entry));
        le16_add_cpu(&xh->xh_count, -1);
-        ret = ocfs2_journal_dirty(handle, xs->bucket.bhs[0]);
+        ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
-        if (ret < 0)
-                mlog_errno(ret);
-out_commit:
-        ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
 }
 /*
@@ -4440,7 +4927,8 @@ out_commit:
 */
 static int ocfs2_xattr_set_in_bucket(struct inode *inode,
                                     struct ocfs2_xattr_info *xi,
-                                     struct ocfs2_xattr_search *xs)
+                                     struct ocfs2_xattr_search *xs,
+                                     struct ocfs2_xattr_set_ctxt *ctxt)
 {
        int ret, local = 1;
        size_t value_len;
@@ -4468,7 +4956,8 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
                        value_len = 0;
                ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
-                                                           value_len);
+                                                           value_len,
+                                                           ctxt);
                if (ret)
                        goto out;
@@ -4488,7 +4977,8 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
                xi->value_len = OCFS2_XATTR_ROOT_SIZE;
        }
-        ret = ocfs2_xattr_set_entry_in_bucket(inode, xi, xs, name_hash, local);
+        ret = ocfs2_xattr_set_entry_in_bucket(inode, ctxt->handle, xi, xs,
+                                              name_hash, local);
        if (ret) {
                mlog_errno(ret);
                goto out;
@@ -4499,7 +4989,7 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
        /* allocate the space now for the outside block storage. */
        ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
-                                                   value_len);
+                                                   value_len, ctxt);
        if (ret) {
                mlog_errno(ret);
@@ -4509,13 +4999,14 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
                         * storage and we have allocated xattr already,
                         * so need to remove it.
                         */
-                        ocfs2_xattr_bucket_remove_xs(inode, xs);
+                        ocfs2_xattr_bucket_remove_xs(inode, ctxt->handle, xs);
                }
                goto out;
        }
 set_value_outside:
-        ret = ocfs2_xattr_bucket_set_value_outside(inode, xs, val, value_len);
+        ret = ocfs2_xattr_bucket_set_value_outside(inode, ctxt->handle,
+                                                   xs, val, value_len);
 out:
        return ret;
 }
@@ -4530,7 +5021,7 @@ static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
                                              struct ocfs2_xattr_bucket *bucket,
                                              const char *name)
 {
-        struct ocfs2_xattr_header *xh = bucket->xh;
+        struct ocfs2_xattr_header *xh = bucket_xh(bucket);
        u32 name_hash = ocfs2_xattr_name_hash(inode, name, strlen(name));
        if (name_hash != le32_to_cpu(xh->xh_entries[0].xe_name_hash))
@@ -4540,7 +5031,7 @@ static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
            xh->xh_entries[0].xe_name_hash) {
                mlog(ML_ERROR, "Too much hash collision in xattr bucket %llu, "
                     "hash = %u\n",
-                     (unsigned long long)bucket->bhs[0]->b_blocknr,
+                     (unsigned long long)bucket_blkno(bucket),
                     le32_to_cpu(xh->xh_entries[0].xe_name_hash));
                return -ENOSPC;
        }
@@ -4550,16 +5041,16 @@ static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
 static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
                                             struct ocfs2_xattr_info *xi,
-                                             struct ocfs2_xattr_search *xs)
+                                             struct ocfs2_xattr_search *xs,
+                                             struct ocfs2_xattr_set_ctxt *ctxt)
 {
        struct ocfs2_xattr_header *xh;
        struct ocfs2_xattr_entry *xe;
        u16 count, header_size, xh_free_start;
-        int i, free, max_free, need, old;
+        int free, max_free, need, old;
        size_t value_size = 0, name_len = strlen(xi->name);
        size_t blocksize = inode->i_sb->s_blocksize;
        int ret, allocation = 0;
-        u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
        mlog_entry("Set xattr %s in xattr index block\n", xi->name);
@@ -4574,7 +5065,7 @@ try_again:
        mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size "
                        "of %u which exceed block size\n",
-                        (unsigned long long)xs->bucket.bhs[0]->b_blocknr,
+                        (unsigned long long)bucket_blkno(xs->bucket),
                        header_size);
        if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE)
@@ -4614,11 +5105,13 @@ try_again:
        mlog(0, "xs->not_found = %d, in xattr bucket %llu: free = %d, "
             "need = %d, max_free = %d, xh_free_start = %u, xh_name_value_len ="
             " %u\n", xs->not_found,
-             (unsigned long long)xs->bucket.bhs[0]->b_blocknr,
+             (unsigned long long)bucket_blkno(xs->bucket),
             free, need, max_free, le16_to_cpu(xh->xh_free_start),
             le16_to_cpu(xh->xh_name_value_len));
-        if (free < need || count == ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) {
+        if (free < need ||
+            (xs->not_found &&
+             count == ocfs2_xattr_max_xe_in_bucket(inode->i_sb))) {
                if (need <= max_free &&
                    count < ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) {
                        /*
@@ -4626,7 +5119,8 @@ try_again:
                         * name/value will be moved, the xe shouldn't be changed
                         * in xs.
                         */
-                        ret = ocfs2_defrag_xattr_bucket(inode, &xs->bucket);
+                        ret = ocfs2_defrag_xattr_bucket(inode, ctxt->handle,
+                                                        xs->bucket);
                        if (ret) {
                                mlog_errno(ret);
                                goto out;
@@ -4658,7 +5152,7 @@ try_again:
                 * add a new bucket for the insert.
                 */
                ret = ocfs2_check_xattr_bucket_collision(inode,
-                                                         &xs->bucket,
+                                                         xs->bucket,
                                                         xi->name);
                if (ret) {
                        mlog_errno(ret);
@@ -4667,17 +5161,21 @@ try_again:
                ret = ocfs2_add_new_xattr_bucket(inode,
                                                 xs->xattr_bh,
-                                                 xs->bucket.bhs[0]);
+                                                 xs->bucket,
+                                                 ctxt);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
                }
-                for (i = 0; i < blk_per_bucket; i++)
+                /*
-                        brelse(xs->bucket.bhs[i]);
+                 * ocfs2_add_new_xattr_bucket() will have updated
+                 * xs->bucket if it moved, but it will not have updated
-                memset(&xs->bucket, 0, sizeof(xs->bucket));
+                 * any of the other search fields.  Thus, we drop it and
+                 * re-search.  Everything should be cached, so it'll be
+                 * quick.
+                 */
+                ocfs2_xattr_bucket_relse(xs->bucket);
                ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh,
                                                   xi->name_index,
                                                   xi->name, xs);
@@ -4689,7 +5187,7 @@ try_again:
        }
 xattr_set:
-        ret = ocfs2_xattr_set_in_bucket(inode, xi, xs);
+        ret = ocfs2_xattr_set_in_bucket(inode, xi, xs, ctxt);
 out:
        mlog_exit(ret);
        return ret;
@@ -4700,24 +5198,41 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
                                        void *para)
 {
        int ret = 0;
-        struct ocfs2_xattr_header *xh = bucket->xh;
+        struct ocfs2_xattr_header *xh = bucket_xh(bucket);
        u16 i;
        struct ocfs2_xattr_entry *xe;
+        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        struct ocfs2_xattr_set_ctxt ctxt = {NULL, NULL,};
+        int credits = ocfs2_remove_extent_credits(osb->sb) +
+                ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+        ocfs2_init_dealloc_ctxt(&ctxt.dealloc);
        for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
                xe = &xh->xh_entries[i];
                if (ocfs2_xattr_is_local(xe))
                        continue;
-                ret = ocfs2_xattr_bucket_value_truncate(inode,
+                ctxt.handle = ocfs2_start_trans(osb, credits);
-                                                        bucket->bhs[0],
+                if (IS_ERR(ctxt.handle)) {
-                                                        i, 0);
+                        ret = PTR_ERR(ctxt.handle);
+                        mlog_errno(ret);
+                        break;
+                }
+                ret = ocfs2_xattr_bucket_value_truncate(inode, bucket,
+                                                        i, 0, &ctxt);
+                ocfs2_commit_trans(osb, ctxt.handle);
                if (ret) {
                        mlog_errno(ret);
                        break;
                }
        }
+        ocfs2_schedule_truncate_log_flush(osb, 1);
+        ocfs2_run_deallocs(osb, &ctxt.dealloc);
        return ret;
 }
@@ -4768,6 +5283,74 @@ out:
 }
 /*
+ * 'security' attributes support
+ */
+static size_t ocfs2_xattr_security_list(struct inode *inode, char *list,
+                                        size_t list_size, const char *name,
+                                        size_t name_len)
+{
+        const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
+        const size_t total_len = prefix_len + name_len + 1;
+        if (list && total_len <= list_size) {
+                memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
+                memcpy(list + prefix_len, name, name_len);
+                list[prefix_len + name_len] = '\0';
+        }
+        return total_len;
+}
+static int ocfs2_xattr_security_get(struct inode *inode, const char *name,
+                                    void *buffer, size_t size)
+{
+        if (strcmp(name, "") == 0)
+                return -EINVAL;
+        return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_SECURITY, name,
+                               buffer, size);
+}
+static int ocfs2_xattr_security_set(struct inode *inode, const char *name,
+                                    const void *value, size_t size, int flags)
+{
+        if (strcmp(name, "") == 0)
+                return -EINVAL;
+        return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, name, value,
+                               size, flags);
+}
+int ocfs2_init_security_get(struct inode *inode,
+                            struct inode *dir,
+                            struct ocfs2_security_xattr_info *si)
+{
+        /* check whether ocfs2 support feature xattr */
+        if (!ocfs2_supports_xattr(OCFS2_SB(dir->i_sb)))
+                return -EOPNOTSUPP;
+        return security_inode_init_security(inode, dir, &si->name, &si->value,
+                                            &si->value_len);
+}
+int ocfs2_init_security_set(handle_t *handle,
+                            struct inode *inode,
+                            struct buffer_head *di_bh,
+                            struct ocfs2_security_xattr_info *si,
+                            struct ocfs2_alloc_context *xattr_ac,
+                            struct ocfs2_alloc_context *data_ac)
+{
+        return ocfs2_xattr_set_handle(handle, inode, di_bh,
+                                     OCFS2_XATTR_INDEX_SECURITY,
+                                     si->name, si->value, si->value_len, 0,
+                                     xattr_ac, data_ac);
+}
+struct xattr_handler ocfs2_xattr_security_handler = {
+        .prefix = XATTR_SECURITY_PREFIX,
+        .list   = ocfs2_xattr_security_list,
+        .get    = ocfs2_xattr_security_get,
+        .set    = ocfs2_xattr_security_set,
+};
+/*
 * 'trusted' attributes support
 */
 static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list,
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 1d8314c7656d..5a1ebc789f7e 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -30,13 +30,58 @@ enum ocfs2_xattr_type {
        OCFS2_XATTR_MAX
 };
+struct ocfs2_security_xattr_info {
+        int enable;
+        char *name;
+        void *value;
+        size_t value_len;
+};
 extern struct xattr_handler ocfs2_xattr_user_handler;
 extern struct xattr_handler ocfs2_xattr_trusted_handler;
+extern struct xattr_handler ocfs2_xattr_security_handler;
+#ifdef CONFIG_OCFS2_FS_POSIX_ACL
+extern struct xattr_handler ocfs2_xattr_acl_access_handler;
+extern struct xattr_handler ocfs2_xattr_acl_default_handler;
+#endif
 extern struct xattr_handler *ocfs2_xattr_handlers[];
 ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
+int ocfs2_xattr_get_nolock(struct inode *, struct buffer_head *, int,
+                           const char *, void *, size_t);
 int ocfs2_xattr_set(struct inode *, int, const char *, const void *,
                    size_t, int);
+int ocfs2_xattr_set_handle(handle_t *, struct inode *, struct buffer_head *,
+                           int, const char *, const void *, size_t, int,
+                           struct ocfs2_alloc_context *,
+                           struct ocfs2_alloc_context *);
 int ocfs2_xattr_remove(struct inode *, struct buffer_head *);
+int ocfs2_init_security_get(struct inode *, struct inode *,
+                            struct ocfs2_security_xattr_info *);
+int ocfs2_init_security_set(handle_t *, struct inode *,
+                            struct buffer_head *,
+                            struct ocfs2_security_xattr_info *,
+                            struct ocfs2_alloc_context *,
+                            struct ocfs2_alloc_context *);
+int ocfs2_calc_security_init(struct inode *,
+                             struct ocfs2_security_xattr_info *,
+                             int *, int *, struct ocfs2_alloc_context **);
+int ocfs2_calc_xattr_init(struct inode *, struct buffer_head *,
+                          int, struct ocfs2_security_xattr_info *,
+                          int *, int *, struct ocfs2_alloc_context **);
+/*
+ * xattrs can live inside an inode, as part of an external xattr block,
+ * or inside an xattr bucket, which is the leaf of a tree rooted in an
+ * xattr block.  Some of the xattr calls, especially the value setting
+ * functions, want to treat each of these locations as equal.  Let's wrap
+ * them in a structure that we can pass around instead of raw buffer_heads.
+ */
+struct ocfs2_xattr_value_buf {
+        struct buffer_head              *vb_bh;
+        ocfs2_journal_access_func       vb_access;
+        struct ocfs2_xattr_value_root   *vb_xv;
+};
 #endif /* OCFS2_XATTR_H */
diff --git a/fs/omfs/Kconfig b/fs/omfs/Kconfig
new file mode 100644
index 000000000000..b1b9a0aba6fd
--- /dev/null
+++ b/fs/omfs/Kconfig
@@ -0,0 +1,13 @@
+config OMFS_FS
+        tristate "SonicBlue Optimized MPEG File System support"
+        depends on BLOCK
+        select CRC_ITU_T
+        help
+          This is the proprietary file system used by the Rio Karma music
+          player and ReplayTV DVR.  Despite the name, this filesystem is not
+          more efficient than a standard FS for MPEG files, in fact likely
+          the opposite is true.  Say Y if you have either of these devices
+          and wish to mount its disk.
+          To compile this file system support as a module, choose M here: the
+          module will be called omfs.  If unsure, say N.
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index 6afe57c84f84..633e9dc972bb 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -39,7 +39,6 @@ struct inode *omfs_new_inode(struct inode *dir, int mode)
        inode->i_mode = mode;
        inode->i_uid = current_fsuid();
        inode->i_gid = current_fsgid();
-        inode->i_blocks = 0;
        inode->i_mapping->a_ops = &omfs_aops;
        inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
diff --git a/fs/open.c b/fs/open.c
index c0a426d5766c..a3a78ceb2a2b 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -122,7 +122,7 @@ static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf)
        return 0;
 }
-asmlinkage long sys_statfs(const char __user *pathname, struct statfs __user * buf)
+SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf)
 {
        struct path path;
        int error;
@@ -138,8 +138,7 @@ asmlinkage long sys_statfs(const char __user *pathname, struct statfs __user * b
        return error;
 }
+SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf)
-asmlinkage long sys_statfs64(const char __user *pathname, size_t sz, struct statfs64 __user *buf)
 {
        struct path path;
        long error;
@@ -157,8 +156,7 @@ asmlinkage long sys_statfs64(const char __user *pathname, size_t sz, struct stat
        return error;
 }
+SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf)
-asmlinkage long sys_fstatfs(unsigned int fd, struct statfs __user * buf)
 {
        struct file * file;
        struct statfs tmp;
@@ -176,7 +174,7 @@ out:
        return error;
 }
-asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz, struct statfs64 __user *buf)
+SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf)
 {
        struct file * file;
        struct statfs64 tmp;
@@ -272,6 +270,8 @@ static long do_sys_truncate(const char __user *pathname, loff_t length)
                goto put_write_and_out;
        error = locks_verify_truncate(inode, NULL, length);
+        if (!error)
+                error = security_path_truncate(&path, length, 0);
        if (!error) {
                DQUOT_INIT(inode);
                error = do_truncate(path.dentry, length, 0, NULL);
@@ -287,7 +287,7 @@ out:
        return error;
 }
-asmlinkage long sys_truncate(const char __user * path, unsigned long length)
+SYSCALL_DEFINE2(truncate, const char __user *, path, unsigned long, length)
 {
        /* on 32-bit boxen it will cut the range 2^31--2^32-1 off */
        return do_sys_truncate(path, (long)length);
@@ -329,6 +329,9 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
        error = locks_verify_truncate(inode, file, length);
        if (!error)
+                error = security_path_truncate(&file->f_path, length,
+                                               ATTR_MTIME|ATTR_CTIME);
+        if (!error)
                error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file);
 out_putf:
        fput(file);
@@ -336,7 +339,7 @@ out:
        return error;
 }
-asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length)
+SYSCALL_DEFINE2(ftruncate, unsigned int, fd, unsigned long, length)
 {
        long ret = do_sys_ftruncate(fd, length, 1);
        /* avoid REGPARM breakage on x86: */
@@ -346,21 +349,35 @@ asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length)
 /* LFS versions of truncate are only needed on 32 bit machines */
 #if BITS_PER_LONG == 32
-asmlinkage long sys_truncate64(const char __user * path, loff_t length)
+SYSCALL_DEFINE(truncate64)(const char __user * path, loff_t length)
 {
        return do_sys_truncate(path, length);
 }
+#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
+asmlinkage long SyS_truncate64(long path, loff_t length)
+{
+        return SYSC_truncate64((const char __user *) path, length);
+}
+SYSCALL_ALIAS(sys_truncate64, SyS_truncate64);
+#endif
-asmlinkage long sys_ftruncate64(unsigned int fd, loff_t length)
+SYSCALL_DEFINE(ftruncate64)(unsigned int fd, loff_t length)
 {
        long ret = do_sys_ftruncate(fd, length, 0);
        /* avoid REGPARM breakage on x86: */
        asmlinkage_protect(2, ret, fd, length);
        return ret;
 }
+#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
+asmlinkage long SyS_ftruncate64(long fd, loff_t length)
+{
+        return SYSC_ftruncate64((unsigned int) fd, length);
+}
+SYSCALL_ALIAS(sys_ftruncate64, SyS_ftruncate64);
 #endif
+#endif /* BITS_PER_LONG == 32 */
-asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len)
+SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len)
 {
        struct file *file;
        struct inode *inode;
@@ -407,7 +424,7 @@ asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len)
        if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0))
                goto out_fput;
-        if (inode->i_op && inode->i_op->fallocate)
+        if (inode->i_op->fallocate)
                ret = inode->i_op->fallocate(inode, mode, offset, len);
        else
                ret = -EOPNOTSUPP;
@@ -417,13 +434,20 @@ out_fput:
 out:
        return ret;
 }
+#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
+asmlinkage long SyS_fallocate(long fd, long mode, loff_t offset, loff_t len)
+{
+        return SYSC_fallocate((int)fd, (int)mode, offset, len);
+}
+SYSCALL_ALIAS(sys_fallocate, SyS_fallocate);
+#endif
 /*
 * access() needs to use the real uid/gid, not the effective uid/gid.
 * We do this by temporarily clearing all FS-related capabilities and
 * switching the fsuid/fsgid around to the real ones.
 */
-asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode)
+SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
 {
        const struct cred *old_cred;
        struct cred *override_cred;
@@ -493,12 +517,12 @@ out:
        return res;
 }
-asmlinkage long sys_access(const char __user *filename, int mode)
+SYSCALL_DEFINE2(access, const char __user *, filename, int, mode)
 {
        return sys_faccessat(AT_FDCWD, filename, mode);
 }
-asmlinkage long sys_chdir(const char __user * filename)
+SYSCALL_DEFINE1(chdir, const char __user *, filename)
 {
        struct path path;
        int error;
@@ -519,7 +543,7 @@ out:
        return error;
 }
-asmlinkage long sys_fchdir(unsigned int fd)
+SYSCALL_DEFINE1(fchdir, unsigned int, fd)
 {
        struct file *file;
        struct inode *inode;
@@ -545,7 +569,7 @@ out:
        return error;
 }
-asmlinkage long sys_chroot(const char __user * filename)
+SYSCALL_DEFINE1(chroot, const char __user *, filename)
 {
        struct path path;
        int error;
@@ -570,7 +594,7 @@ out:
        return error;
 }
-asmlinkage long sys_fchmod(unsigned int fd, mode_t mode)
+SYSCALL_DEFINE2(fchmod, unsigned int, fd, mode_t, mode)
 {
        struct inode * inode;
        struct dentry * dentry;
@@ -604,8 +628,7 @@ out:
        return err;
 }
-asmlinkage long sys_fchmodat(int dfd, const char __user *filename,
+SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, mode_t, mode)
-                             mode_t mode)
 {
        struct path path;
        struct inode *inode;
@@ -634,7 +657,7 @@ out:
        return error;
 }
-asmlinkage long sys_chmod(const char __user *filename, mode_t mode)
+SYSCALL_DEFINE2(chmod, const char __user *, filename, mode_t, mode)
 {
        return sys_fchmodat(AT_FDCWD, filename, mode);
 }
@@ -664,7 +687,7 @@ static int chown_common(struct dentry * dentry, uid_t user, gid_t group)
        return error;
 }
-asmlinkage long sys_chown(const char __user * filename, uid_t user, gid_t group)
+SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group)
 {
        struct path path;
        int error;
@@ -683,8 +706,8 @@ out:
        return error;
 }
-asmlinkage long sys_fchownat(int dfd, const char __user *filename, uid_t user,
+SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
-                             gid_t group, int flag)
+                gid_t, group, int, flag)
 {
        struct path path;
        int error = -EINVAL;
@@ -708,7 +731,7 @@ out:
        return error;
 }
-asmlinkage long sys_lchown(const char __user * filename, uid_t user, gid_t group)
+SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group)
 {
        struct path path;
        int error;
@@ -727,8 +750,7 @@ out:
        return error;
 }
+SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
-asmlinkage long sys_fchown(unsigned int fd, uid_t user, gid_t group)
 {
        struct file * file;
        int error = -EBADF;
@@ -1024,7 +1046,7 @@ long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
        return fd;
 }
-asmlinkage long sys_open(const char __user *filename, int flags, int mode)
+SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, int, mode)
 {
        long ret;
@@ -1037,8 +1059,8 @@ asmlinkage long sys_open(const char __user *filename, int flags, int mode)
        return ret;
 }
-asmlinkage long sys_openat(int dfd, const char __user *filename, int flags,
+SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags,
-                           int mode)
+                int, mode)
 {
        long ret;
@@ -1057,7 +1079,7 @@ asmlinkage long sys_openat(int dfd, const char __user *filename, int flags,
 * For backward compatibility?  Maybe this should be moved
 * into arch/i386 instead?
 */
-asmlinkage long sys_creat(const char __user * pathname, int mode)
+SYSCALL_DEFINE2(creat, const char __user *, pathname, int, mode)
 {
        return sys_open(pathname, O_CREAT | O_WRONLY | O_TRUNC, mode);
 }
@@ -1093,7 +1115,7 @@ EXPORT_SYMBOL(filp_close);
 * releasing the fd. This ensures that one clone task can't release
 * an fd while another clone is opening it.
 */
-asmlinkage long sys_close(unsigned int fd)
+SYSCALL_DEFINE1(close, unsigned int, fd)
 {
        struct file * filp;
        struct files_struct *files = current->files;
@@ -1126,14 +1148,13 @@ out_unlock:
        spin_unlock(&files->file_lock);
        return -EBADF;
 }
 EXPORT_SYMBOL(sys_close);
 /*
 * This routine simulates a hangup on the tty, to arrange that users
 * are given clean terminals at login time.
 */
-asmlinkage long sys_vhangup(void)
+SYSCALL_DEFINE0(vhangup)
 {
        if (capable(CAP_SYS_TTY_CONFIG)) {
                tty_vhangup_self();
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index d41bdc784de4..ffcd04f0012c 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -256,9 +256,6 @@ found:
                break;
        }
-        inode->i_gid = 0;
-        inode->i_uid = 0;
        d_add(dentry, inode);
        return NULL;
 }
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 6d5b213b8a9b..6d720243f5f4 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -334,6 +334,7 @@ void delete_partition(struct gendisk *disk, int partno)
        blk_free_devt(part_devt(part));
        rcu_assign_pointer(ptbl->part[partno], NULL);
+        rcu_assign_pointer(ptbl->last_lookup, NULL);
        kobject_put(part->holder_dir);
        device_del(part_to_dev(part));
@@ -384,9 +385,9 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
        dname = dev_name(ddev);
        if (isdigit(dname[strlen(dname) - 1]))
-                snprintf(pdev->bus_id, BUS_ID_SIZE, "%sp%d", dname, partno);
+                dev_set_name(pdev, "%sp%d", dname, partno);
        else
-                snprintf(pdev->bus_id, BUS_ID_SIZE, "%s%d", dname, partno);
+                dev_set_name(pdev, "%s%d", dname, partno);
        device_initialize(pdev);
        pdev->class = &block_class;
@@ -447,16 +448,11 @@ void register_disk(struct gendisk *disk)
        struct block_device *bdev;
        struct disk_part_iter piter;
        struct hd_struct *part;
-        char *s;
        int err;
        ddev->parent = disk->driverfs_dev;
-        strlcpy(ddev->bus_id, disk->disk_name, BUS_ID_SIZE);
+        /* disk_name is data: pass it as an argument, never as the format string */
+        dev_set_name(ddev, "%s", disk->disk_name);
-        /* ewww... some of these buggers have / in the name... */
-        s = strchr(ddev->bus_id, '/');
-        if (s)
-                *s = '!';
        /* delay uevents, until we scanned partition table */
        ddev->uevent_suppress = 1;
diff --git a/fs/pipe.c b/fs/pipe.c
index aaf797bd57b9..3a48ba5179d5 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1016,10 +1016,7 @@ int do_pipe_flags(int *fd, int flags)
                goto err_fdr;
        fdw = error;
-        error = audit_fd_pair(fdr, fdw);
+        audit_fd_pair(fdr, fdw);
-        if (error < 0)
-                goto err_fdw;
        fd_install(fdr, fr);
        fd_install(fdw, fw);
        fd[0] = fdr;
@@ -1027,8 +1024,6 @@ int do_pipe_flags(int *fd, int flags)
        return 0;
- err_fdw:
-        put_unused_fd(fdw);
 err_fdr:
        put_unused_fd(fdr);
 err_read_pipe:
@@ -1048,7 +1043,7 @@ int do_pipe(int *fd)
 * sys_pipe() is the normal C calling standard for creating
 * a pipe. It's not the way Unix traditionally does this, though.
 */
-asmlinkage long __weak sys_pipe2(int __user *fildes, int flags)
+SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
 {
        int fd[2];
        int error;
@@ -1064,7 +1059,7 @@ asmlinkage long __weak sys_pipe2(int __user *fildes, int flags)
        return error;
 }
-asmlinkage long __weak sys_pipe(int __user *fildes)
+SYSCALL_DEFINE1(pipe, int __user *, fildes)
 {
        return sys_pipe2(fildes, 0);
 }
diff --git a/fs/proc/base.c b/fs/proc/base.c
index cad92c1ac2b3..0c9de19a1633 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -65,6 +65,7 @@
 #include <linux/mm.h>
 #include <linux/rcupdate.h>
 #include <linux/kallsyms.h>
+#include <linux/stacktrace.h>
 #include <linux/resource.h>
 #include <linux/module.h>
 #include <linux/mount.h>
@@ -109,25 +110,22 @@ struct pid_entry {
        .op   = OP,                                     \
 }
-#define DIR(NAME, MODE, OTYPE)                                                  \
+#define DIR(NAME, MODE, iops, fops)     \
-        NOD(NAME, (S_IFDIR|(MODE)),                                             \
+        NOD(NAME, (S_IFDIR|(MODE)), &iops, &fops, {} )
-                &proc_##OTYPE##_inode_operations, &proc_##OTYPE##_operations,   \
+#define LNK(NAME, get_link)                                     \
-                {} )
-#define LNK(NAME, OTYPE)                                        \
        NOD(NAME, (S_IFLNK|S_IRWXUGO),                          \
                &proc_pid_link_inode_operations, NULL,          \
-                { .proc_get_link = &proc_##OTYPE##_link } )
+                { .proc_get_link = get_link } )
-#define REG(NAME, MODE, OTYPE)                          \
+#define REG(NAME, MODE, fops)                           \
-        NOD(NAME, (S_IFREG|(MODE)), NULL,               \
+        NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {})
-                &proc_##OTYPE##_operations, {})
+#define INF(NAME, MODE, read)                           \
-#define INF(NAME, MODE, OTYPE)                          \
        NOD(NAME, (S_IFREG|(MODE)),                     \
                NULL, &proc_info_file_operations,       \
-                { .proc_read = &proc_##OTYPE } )
+                { .proc_read = read } )
-#define ONE(NAME, MODE, OTYPE)                          \
+#define ONE(NAME, MODE, show)                           \
        NOD(NAME, (S_IFREG|(MODE)),                     \
                NULL, &proc_single_file_operations,     \
-                { .proc_show = &proc_##OTYPE } )
+                { .proc_show = show } )
 /*
 * Count the number of hardlinks for the pid_entry table, excluding the .
@@ -308,9 +306,9 @@ static int proc_pid_auxv(struct task_struct *task, char *buffer)
        struct mm_struct *mm = get_task_mm(task);
        if (mm) {
                unsigned int nwords = 0;
-                do
+                do {
                        nwords += 2;
-                while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */
+                } while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */
                res = nwords * sizeof(mm->saved_auxv[0]);
                if (res > PAGE_SIZE)
                        res = PAGE_SIZE;
@@ -340,6 +338,37 @@ static int proc_pid_wchan(struct task_struct *task, char *buffer)
 }
 #endif /* CONFIG_KALLSYMS */
+#ifdef CONFIG_STACKTRACE
+#define MAX_STACK_TRACE_DEPTH   64
+/*
+ * Show handler for the "stack" entries of the per-task pid_entry tables.
+ *
+ * Captures at most MAX_STACK_TRACE_DEPTH entries for @task with
+ * save_stack_trace_tsk() and writes one line per captured entry to @m,
+ * formatted as "[<%p>] %pS".  @ns and @pid are not used here.
+ *
+ * Returns 0 on success, or -ENOMEM if the entries buffer cannot be
+ * allocated.
+ */
+static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
+                          struct pid *pid, struct task_struct *task)
+{
+        struct stack_trace trace;
+        unsigned long *entries;
+        int i;
+        entries = kmalloc(MAX_STACK_TRACE_DEPTH * sizeof(*entries), GFP_KERNEL);
+        if (!entries)
+                return -ENOMEM;
+        trace.nr_entries        = 0;
+        trace.max_entries       = MAX_STACK_TRACE_DEPTH;
+        trace.entries           = entries;
+        trace.skip              = 0;
+        save_stack_trace_tsk(task, &trace);
+        for (i = 0; i < trace.nr_entries; i++) {
+                seq_printf(m, "[<%p>] %pS\n",
+                           (void *)entries[i], (void *)entries[i]);
+        }
+        kfree(entries);
+        return 0;
+}
+#endif
 #ifdef CONFIG_SCHEDSTATS
 /*
 * Provides /proc/PID/schedstat
@@ -1186,8 +1215,6 @@ static int sched_show(struct seq_file *m, void *v)
        struct inode *inode = m->private;
        struct task_struct *p;
-        WARN_ON(!inode);
        p = get_proc_task(inode);
        if (!p)
                return -ESRCH;
@@ -1205,8 +1232,6 @@ sched_write(struct file *file, const char __user *buf,
        struct inode *inode = file->f_path.dentry->d_inode;
        struct task_struct *p;
-        WARN_ON(!inode);
        p = get_proc_task(inode);
        if (!p)
                return -ESRCH;
@@ -1426,8 +1451,6 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st
        if (!ei->pid)
                goto out_unlock;
-        inode->i_uid = 0;
-        inode->i_gid = 0;
        if (task_dumpable(task)) {
                rcu_read_lock();
                cred = __task_cred(task);
@@ -1976,13 +1999,11 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
                                         const struct pid_entry *ents,
                                         unsigned int nents)
 {
-        struct inode *inode;
        struct dentry *error;
        struct task_struct *task = get_proc_task(dir);
        const struct pid_entry *p, *last;
        error = ERR_PTR(-ENOENT);
-        inode = NULL;
        if (!task)
                goto out_no_task;
@@ -2138,12 +2159,12 @@ static const struct file_operations proc_pid_attr_operations = {
 };
 static const struct pid_entry attr_dir_stuff[] = {
-        REG("current",    S_IRUGO|S_IWUGO, pid_attr),
+        REG("current",    S_IRUGO|S_IWUGO, proc_pid_attr_operations),
-        REG("prev",       S_IRUGO,         pid_attr),
+        REG("prev",       S_IRUGO,         proc_pid_attr_operations),
-        REG("exec",       S_IRUGO|S_IWUGO, pid_attr),
+        REG("exec",       S_IRUGO|S_IWUGO, proc_pid_attr_operations),
-        REG("fscreate",   S_IRUGO|S_IWUGO, pid_attr),
+        REG("fscreate",   S_IRUGO|S_IWUGO, proc_pid_attr_operations),
-        REG("keycreate",  S_IRUGO|S_IWUGO, pid_attr),
+        REG("keycreate",  S_IRUGO|S_IWUGO, proc_pid_attr_operations),
-        REG("sockcreate", S_IRUGO|S_IWUGO, pid_attr),
+        REG("sockcreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
 };
 static int proc_attr_dir_readdir(struct file * filp,
@@ -2349,8 +2370,6 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
        if (!ei->pid)
                goto out_iput;
-        inode->i_uid = 0;
-        inode->i_gid = 0;
        inode->i_mode = p->mode;
        if (S_ISDIR(inode->i_mode))
                inode->i_nlink = 2;
@@ -2465,74 +2484,77 @@ static const struct file_operations proc_task_operations;
 static const struct inode_operations proc_task_inode_operations;
 static const struct pid_entry tgid_base_stuff[] = {
-        DIR("task",       S_IRUGO|S_IXUGO, task),
+        DIR("task",       S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
-        DIR("fd",         S_IRUSR|S_IXUSR, fd),
+        DIR("fd",         S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
-        DIR("fdinfo",     S_IRUSR|S_IXUSR, fdinfo),
+        DIR("fdinfo",     S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
 #ifdef CONFIG_NET
-        DIR("net",        S_IRUGO|S_IXUGO, net),
+        DIR("net",        S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
 #endif
-        REG("environ",    S_IRUSR, environ),
+        REG("environ",    S_IRUSR, proc_environ_operations),
-        INF("auxv",       S_IRUSR, pid_auxv),
+        INF("auxv",       S_IRUSR, proc_pid_auxv),
-        ONE("status",     S_IRUGO, pid_status),
+        ONE("status",     S_IRUGO, proc_pid_status),
-        ONE("personality", S_IRUSR, pid_personality),
+        ONE("personality", S_IRUSR, proc_pid_personality),
-        INF("limits",     S_IRUSR, pid_limits),
+        INF("limits",     S_IRUSR, proc_pid_limits),
 #ifdef CONFIG_SCHED_DEBUG
-        REG("sched",      S_IRUGO|S_IWUSR, pid_sched),
+        REG("sched",      S_IRUGO|S_IWUSR, proc_pid_sched_operations),
 #endif
 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
-        INF("syscall",    S_IRUSR, pid_syscall),
+        INF("syscall",    S_IRUSR, proc_pid_syscall),
 #endif
-        INF("cmdline",    S_IRUGO, pid_cmdline),
+        INF("cmdline",    S_IRUGO, proc_pid_cmdline),
-        ONE("stat",       S_IRUGO, tgid_stat),
+        ONE("stat",       S_IRUGO, proc_tgid_stat),
-        ONE("statm",      S_IRUGO, pid_statm),
+        ONE("statm",      S_IRUGO, proc_pid_statm),
-        REG("maps",       S_IRUGO, maps),
+        REG("maps",       S_IRUGO, proc_maps_operations),
 #ifdef CONFIG_NUMA
-        REG("numa_maps",  S_IRUGO, numa_maps),
+        REG("numa_maps",  S_IRUGO, proc_numa_maps_operations),
 #endif
-        REG("mem",        S_IRUSR|S_IWUSR, mem),
+        REG("mem",        S_IRUSR|S_IWUSR, proc_mem_operations),
-        LNK("cwd",        cwd),
+        LNK("cwd",        proc_cwd_link),
-        LNK("root",       root),
+        LNK("root",       proc_root_link),
-        LNK("exe",        exe),
+        LNK("exe",        proc_exe_link),
-        REG("mounts",     S_IRUGO, mounts),
+        REG("mounts",     S_IRUGO, proc_mounts_operations),
-        REG("mountinfo",  S_IRUGO, mountinfo),
+        REG("mountinfo",  S_IRUGO, proc_mountinfo_operations),
-        REG("mountstats", S_IRUSR, mountstats),
+        REG("mountstats", S_IRUSR, proc_mountstats_operations),
 #ifdef CONFIG_PROC_PAGE_MONITOR
-        REG("clear_refs", S_IWUSR, clear_refs),
+        REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
-        REG("smaps",      S_IRUGO, smaps),
+        REG("smaps",      S_IRUGO, proc_smaps_operations),
-        REG("pagemap",    S_IRUSR, pagemap),
+        REG("pagemap",    S_IRUSR, proc_pagemap_operations),
 #endif
 #ifdef CONFIG_SECURITY
-        DIR("attr",       S_IRUGO|S_IXUGO, attr_dir),
+        DIR("attr",       S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
 #endif
 #ifdef CONFIG_KALLSYMS
-        INF("wchan",      S_IRUGO, pid_wchan),
+        INF("wchan",      S_IRUGO, proc_pid_wchan),
+#endif
+#ifdef CONFIG_STACKTRACE
+        ONE("stack",      S_IRUSR, proc_pid_stack),
 #endif
 #ifdef CONFIG_SCHEDSTATS
-        INF("schedstat",  S_IRUGO, pid_schedstat),
+        INF("schedstat",  S_IRUGO, proc_pid_schedstat),
 #endif
 #ifdef CONFIG_LATENCYTOP
-        REG("latency",  S_IRUGO, lstats),
+        REG("latency",  S_IRUGO, proc_lstats_operations),
 #endif
 #ifdef CONFIG_PROC_PID_CPUSET
-        REG("cpuset",     S_IRUGO, cpuset),
+        REG("cpuset",     S_IRUGO, proc_cpuset_operations),
 #endif
 #ifdef CONFIG_CGROUPS
-        REG("cgroup",  S_IRUGO, cgroup),
+        REG("cgroup",  S_IRUGO, proc_cgroup_operations),
 #endif
-        INF("oom_score",  S_IRUGO, oom_score),
+        INF("oom_score",  S_IRUGO, proc_oom_score),
-        REG("oom_adj",    S_IRUGO|S_IWUSR, oom_adjust),
+        REG("oom_adj",    S_IRUGO|S_IWUSR, proc_oom_adjust_operations),
 #ifdef CONFIG_AUDITSYSCALL
-        REG("loginuid",   S_IWUSR|S_IRUGO, loginuid),
+        REG("loginuid",   S_IWUSR|S_IRUGO, proc_loginuid_operations),
-        REG("sessionid",  S_IRUGO, sessionid),
+        REG("sessionid",  S_IRUGO, proc_sessionid_operations),
 #endif
 #ifdef CONFIG_FAULT_INJECTION
-        REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject),
+        REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
 #endif
 #if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE)
-        REG("coredump_filter", S_IRUGO|S_IWUSR, coredump_filter),
+        REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations),
 #endif
 #ifdef CONFIG_TASK_IO_ACCOUNTING
-        INF("io",       S_IRUGO, tgid_io_accounting),
+        INF("io",       S_IRUGO, proc_tgid_io_accounting),
 #endif
 };
@@ -2805,66 +2827,69 @@ out_no_task:
 * Tasks
 */
 static const struct pid_entry tid_base_stuff[] = {
-        DIR("fd",        S_IRUSR|S_IXUSR, fd),
+        DIR("fd",        S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
-        DIR("fdinfo",    S_IRUSR|S_IXUSR, fdinfo),
+        /* fdinfo pairs with proc_fdinfo_operations, exactly as in tgid_base_stuff */
+        DIR("fdinfo",    S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
-        REG("environ",   S_IRUSR, environ),
+        REG("environ",   S_IRUSR, proc_environ_operations),
-        INF("auxv",      S_IRUSR, pid_auxv),
+        INF("auxv",      S_IRUSR, proc_pid_auxv),
-        ONE("status",    S_IRUGO, pid_status),
+        ONE("status",    S_IRUGO, proc_pid_status),
-        ONE("personality", S_IRUSR, pid_personality),
+        ONE("personality", S_IRUSR, proc_pid_personality),
-        INF("limits",    S_IRUSR, pid_limits),
+        INF("limits",    S_IRUSR, proc_pid_limits),
 #ifdef CONFIG_SCHED_DEBUG
-        REG("sched",     S_IRUGO|S_IWUSR, pid_sched),
+        REG("sched",     S_IRUGO|S_IWUSR, proc_pid_sched_operations),
 #endif
 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
-        INF("syscall",   S_IRUSR, pid_syscall),
+        INF("syscall",   S_IRUSR, proc_pid_syscall),
 #endif
-        INF("cmdline",   S_IRUGO, pid_cmdline),
+        INF("cmdline",   S_IRUGO, proc_pid_cmdline),
-        ONE("stat",      S_IRUGO, tid_stat),
+        ONE("stat",      S_IRUGO, proc_tid_stat),
-        ONE("statm",     S_IRUGO, pid_statm),
+        ONE("statm",     S_IRUGO, proc_pid_statm),
-        REG("maps",      S_IRUGO, maps),
+        REG("maps",      S_IRUGO, proc_maps_operations),
 #ifdef CONFIG_NUMA
-        REG("numa_maps", S_IRUGO, numa_maps),
+        REG("numa_maps", S_IRUGO, proc_numa_maps_operations),
 #endif
-        REG("mem",       S_IRUSR|S_IWUSR, mem),
+        REG("mem",       S_IRUSR|S_IWUSR, proc_mem_operations),
-        LNK("cwd",       cwd),
+        LNK("cwd",       proc_cwd_link),
-        LNK("root",      root),
+        LNK("root",      proc_root_link),
-        LNK("exe",       exe),
+        LNK("exe",       proc_exe_link),
-        REG("mounts",    S_IRUGO, mounts),
+        REG("mounts",    S_IRUGO, proc_mounts_operations),
-        REG("mountinfo",  S_IRUGO, mountinfo),
+        REG("mountinfo",  S_IRUGO, proc_mountinfo_operations),
 #ifdef CONFIG_PROC_PAGE_MONITOR
-        REG("clear_refs", S_IWUSR, clear_refs),
+        REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
-        REG("smaps",     S_IRUGO, smaps),
+        REG("smaps",     S_IRUGO, proc_smaps_operations),
-        REG("pagemap",    S_IRUSR, pagemap),
+        REG("pagemap",    S_IRUSR, proc_pagemap_operations),
 #endif
 #ifdef CONFIG_SECURITY
-        DIR("attr",      S_IRUGO|S_IXUGO, attr_dir),
+        DIR("attr",      S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
 #endif
 #ifdef CONFIG_KALLSYMS
-        INF("wchan",     S_IRUGO, pid_wchan),
+        INF("wchan",     S_IRUGO, proc_pid_wchan),
+#endif
+#ifdef CONFIG_STACKTRACE
+        ONE("stack",      S_IRUSR, proc_pid_stack),
 #endif
 #ifdef CONFIG_SCHEDSTATS
-        INF("schedstat", S_IRUGO, pid_schedstat),
+        INF("schedstat", S_IRUGO, proc_pid_schedstat),
 #endif
 #ifdef CONFIG_LATENCYTOP
-        REG("latency",  S_IRUGO, lstats),
+        REG("latency",  S_IRUGO, proc_lstats_operations),
 #endif
 #ifdef CONFIG_PROC_PID_CPUSET
-        REG("cpuset",    S_IRUGO, cpuset),
+        REG("cpuset",    S_IRUGO, proc_cpuset_operations),
 #endif
 #ifdef CONFIG_CGROUPS
-        REG("cgroup",  S_IRUGO, cgroup),
+        REG("cgroup",  S_IRUGO, proc_cgroup_operations),
 #endif
-        INF("oom_score", S_IRUGO, oom_score),
+        INF("oom_score", S_IRUGO, proc_oom_score),
-        REG("oom_adj",   S_IRUGO|S_IWUSR, oom_adjust),
+        REG("oom_adj",   S_IRUGO|S_IWUSR, proc_oom_adjust_operations),
 #ifdef CONFIG_AUDITSYSCALL
-        REG("loginuid",  S_IWUSR|S_IRUGO, loginuid),
+        REG("loginuid",  S_IWUSR|S_IRUGO, proc_loginuid_operations),
-        REG("sessionid",  S_IRUSR, sessionid),
+        REG("sessionid",  S_IRUSR, proc_sessionid_operations),
 #endif
 #ifdef CONFIG_FAULT_INJECTION
-        REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject),
+        REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
 #endif
 #ifdef CONFIG_TASK_IO_ACCOUNTING
-        INF("io",       S_IRUGO, tid_io_accounting),
+        INF("io",       S_IRUGO, proc_tid_io_accounting),
 #endif
 };
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 60a359b35582..db7fa5cab988 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -14,7 +14,6 @@
 #include <linux/stat.h>
 #include <linux/module.h>
 #include <linux/mount.h>
-#include <linux/smp_lock.h>
 #include <linux/init.h>
 #include <linux/idr.h>
 #include <linux/namei.h>
@@ -379,7 +378,6 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
        struct inode *inode = NULL;
        int error = -ENOENT;
-        lock_kernel();
        spin_lock(&proc_subdir_lock);
        for (de = de->subdir; de ; de = de->next) {
                if (de->namelen != dentry->d_name.len)
@@ -397,7 +395,6 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
        }
        spin_unlock(&proc_subdir_lock);
 out_unlock:
-        unlock_kernel();
        if (inode) {
                dentry->d_op = &proc_dentry_operations;
@@ -432,8 +429,6 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
        struct inode *inode = filp->f_path.dentry->d_inode;
        int ret = 0;
-        lock_kernel();
        ino = inode->i_ino;
        i = filp->f_pos;
        switch (i) {
@@ -487,7 +482,7 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
                        spin_unlock(&proc_subdir_lock);
        }
        ret = 1;
-out:    unlock_kernel();
+out:
        return ret;     
 }
@@ -504,6 +499,7 @@ int proc_readdir(struct file *filp, void *dirent, filldir_t filldir)
 * the /proc directory.
 */
 static const struct file_operations proc_dir_operations = {
+        .llseek                 = generic_file_llseek,
        .read                   = generic_read_dir,
        .readdir                = proc_readdir,
 };
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 2543fd00c658..3e76bb9b3ad6 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -35,16 +35,13 @@ struct proc_dir_entry *de_get(struct proc_dir_entry *de)
 */
 void de_put(struct proc_dir_entry *de)
 {
-        lock_kernel();
        if (!atomic_read(&de->count)) {
                printk("de_put: entry %s already free!\n", de->name);
-                unlock_kernel();
                return;
        }
        if (atomic_dec_and_test(&de->count))
                free_proc_entry(de);
-        unlock_kernel();
 }
 /*
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 3e8aeb8b61ce..cd53ff838498 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -41,8 +41,6 @@ do {						\
        (vmi)->used = 0;                        \
        (vmi)->largest_chunk = 0;               \
 } while(0)
-extern int nommu_vma_show(struct seq_file *, struct vm_area_struct *);
 #endif
 extern int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns,
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index b1675c4e66da..43d23948384a 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -74,6 +74,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
                "LowTotal:       %8lu kB\n"
                "LowFree:        %8lu kB\n"
 #endif
+#ifndef CONFIG_MMU
+                "MmapCopy:       %8lu kB\n"
+#endif
                "SwapTotal:      %8lu kB\n"
                "SwapFree:       %8lu kB\n"
                "Dirty:          %8lu kB\n"
@@ -116,6 +119,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
                K(i.totalram-i.totalhigh),
                K(i.freeram-i.freehigh),
 #endif
+#ifndef CONFIG_MMU
+                K((unsigned long) atomic_read(&mmap_pages_allocated)),
+#endif
                K(i.totalswap),
                K(i.freeswap),
                K(global_page_state(NR_FILE_DIRTY)),
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 3f87d2632947..b446d7ad0b0d 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -33,33 +33,33 @@
 #include "internal.h"
 /*
- * display a single VMA to a sequenced file
+ * display a single region to a sequenced file
 */
-int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
+static int nommu_region_show(struct seq_file *m, struct vm_region *region)
 {
        unsigned long ino = 0;
        struct file *file;
        dev_t dev = 0;
        int flags, len;
-        flags = vma->vm_flags;
+        flags = region->vm_flags;
-        file = vma->vm_file;
+        file = region->vm_file;
        if (file) {
-                struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+                struct inode *inode = region->vm_file->f_path.dentry->d_inode;
                dev = inode->i_sb->s_dev;
                ino = inode->i_ino;
        }
        seq_printf(m,
                   "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
-                   vma->vm_start,
+                   region->vm_start,
-                   vma->vm_end,
+                   region->vm_end,
                   flags & VM_READ ? 'r' : '-',
                   flags & VM_WRITE ? 'w' : '-',
                   flags & VM_EXEC ? 'x' : '-',
                   flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p',
-                   ((loff_t)vma->vm_pgoff) << PAGE_SHIFT,
+                   ((loff_t)region->vm_pgoff) << PAGE_SHIFT,
                   MAJOR(dev), MINOR(dev), ino, &len);
        if (file) {
@@ -75,61 +75,54 @@ int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
 }
 /*
- * display a list of all the VMAs the kernel knows about
+ * display a list of all the REGIONs the kernel knows about
 * - nommu kernals have a single flat list
 */
-static int nommu_vma_list_show(struct seq_file *m, void *v)
+static int nommu_region_list_show(struct seq_file *m, void *_p)
 {
-        struct vm_area_struct *vma;
+        struct rb_node *p = _p;
-        vma = rb_entry((struct rb_node *) v, struct vm_area_struct, vm_rb);
+        return nommu_region_show(m, rb_entry(p, struct vm_region, vm_rb));
-        return nommu_vma_show(m, vma);
 }
-static void *nommu_vma_list_start(struct seq_file *m, loff_t *_pos)
+static void *nommu_region_list_start(struct seq_file *m, loff_t *_pos)
 {
-        struct rb_node *_rb;
+        struct rb_node *p;
        loff_t pos = *_pos;
-        void *next = NULL;
-        down_read(&nommu_vma_sem);
+        down_read(&nommu_region_sem);
-        for (_rb = rb_first(&nommu_vma_tree); _rb; _rb = rb_next(_rb)) {
+        for (p = rb_first(&nommu_region_tree); p; p = rb_next(p))
-                if (pos == 0) {
+                if (pos-- == 0)
-                        next = _rb;
+                        return p;
-                        break;
+        return NULL;
-                }
-                pos--;
-        }
-        return next;
 }
-static void nommu_vma_list_stop(struct seq_file *m, void *v)
+static void nommu_region_list_stop(struct seq_file *m, void *v)
 {
-        up_read(&nommu_vma_sem);
+        up_read(&nommu_region_sem);
 }
-static void *nommu_vma_list_next(struct seq_file *m, void *v, loff_t *pos)
+static void *nommu_region_list_next(struct seq_file *m, void *v, loff_t *pos)
 {
        (*pos)++;
        return rb_next((struct rb_node *) v);
 }
-static const struct seq_operations proc_nommu_vma_list_seqop = {
+static struct seq_operations proc_nommu_region_list_seqop = {
-        .start  = nommu_vma_list_start,
+        .start  = nommu_region_list_start,
-        .next   = nommu_vma_list_next,
+        .next   = nommu_region_list_next,
-        .stop   = nommu_vma_list_stop,
+        .stop   = nommu_region_list_stop,
-        .show   = nommu_vma_list_show
+        .show   = nommu_region_list_show
 };
-static int proc_nommu_vma_list_open(struct inode *inode, struct file *file)
+static int proc_nommu_region_list_open(struct inode *inode, struct file *file)
 {
-        return seq_open(file, &proc_nommu_vma_list_seqop);
+        return seq_open(file, &proc_nommu_region_list_seqop);
 }
-static const struct file_operations proc_nommu_vma_list_operations = {
+static const struct file_operations proc_nommu_region_list_operations = {
-        .open    = proc_nommu_vma_list_open,
+        .open    = proc_nommu_region_list_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
@@ -137,7 +130,7 @@ static const struct file_operations proc_nommu_vma_list_operations = {
 static int __init proc_nommu_init(void)
 {
-        proc_create("maps", S_IRUGO, NULL, &proc_nommu_vma_list_operations);
+        proc_create("maps", S_IRUGO, NULL, &proc_nommu_region_list_operations);
        return 0;
 }
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 7bc296f424ae..04d1270f1c38 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -18,7 +18,6 @@
 #include <linux/sched.h>
 #include <linux/module.h>
 #include <linux/bitops.h>
-#include <linux/smp_lock.h>
 #include <linux/mount.h>
 #include <linux/nsproxy.h>
 #include <net/net_namespace.h>
@@ -172,6 +171,7 @@ static int proc_tgid_net_readdir(struct file *filp, void *dirent,
 }
 const struct file_operations proc_net_operations = {
+        .llseek         = generic_file_llseek,
        .read           = generic_read_dir,
        .readdir        = proc_tgid_net_readdir,
 };
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 06ed10b7da9e..94fcfff6863a 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -31,7 +31,6 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
        inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
        inode->i_flags |= S_PRIVATE; /* tell selinux to ignore this inode */
        inode->i_mode = table->mode;
-        inode->i_uid = inode->i_gid = 0;
        if (!table->child) {
                inode->i_mode |= S_IFREG;
                inode->i_op = &proc_sys_inode_operations;
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 7761602af9de..f6299a25594e 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -16,7 +16,6 @@
 #include <linux/sched.h>
 #include <linux/module.h>
 #include <linux/bitops.h>
-#include <linux/smp_lock.h>
 #include <linux/mount.h>
 #include <linux/pid_namespace.h>
@@ -162,17 +161,12 @@ static int proc_root_readdir(struct file * filp,
        unsigned int nr = filp->f_pos;
        int ret;
-        lock_kernel();
        if (nr < FIRST_PROCESS_ENTRY) {
                int error = proc_readdir(filp, dirent, filldir);
-                if (error <= 0) {
+                if (error <= 0)
-                        unlock_kernel();
                        return error;
-                }
                filp->f_pos = FIRST_PROCESS_ENTRY;
        }
-        unlock_kernel();
        ret = proc_pid_readdir(filp, dirent, filldir);
        return ret;
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 3bb1cf1e7425..f75efa22df5e 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -9,6 +9,7 @@
 #include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/time.h>
+#include <linux/irqnr.h>
 #include <asm/cputime.h>
 #ifndef arch_irq_stat_cpu
@@ -45,10 +46,6 @@ static int show_stat(struct seq_file *p, void *v)
                steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
                guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
                for_each_irq_nr(j) {
-#ifdef CONFIG_SPARSE_IRQ
-                        if (!irq_to_desc(j))
-                                continue;
-#endif
                        sum += kstat_irqs_cpu(j, i);
                }
                sum += arch_irq_stat_cpu(i);
@@ -95,12 +92,6 @@ static int show_stat(struct seq_file *p, void *v)
        /* sum again ? it could be updated? */
        for_each_irq_nr(j) {
                per_irq_sum = 0;
-#ifdef CONFIG_SPARSE_IRQ
-                if (!irq_to_desc(j)) {
-                        seq_printf(p, " %u", per_irq_sum);
-                        continue;
-                }
-#endif
                for_each_possible_cpu(i)
                        per_irq_sum += kstat_irqs_cpu(j, i);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 3a8bdd7f5756..94063840832a 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -396,7 +396,9 @@ static int show_smap(struct seq_file *m, void *v)
                   "Private_Clean:  %8lu kB\n"
                   "Private_Dirty:  %8lu kB\n"
                   "Referenced:     %8lu kB\n"
-                   "Swap:           %8lu kB\n",
+                   "Swap:           %8lu kB\n"
+                   "KernelPageSize: %8lu kB\n"
+                   "MMUPageSize:    %8lu kB\n",
                   (vma->vm_end - vma->vm_start) >> 10,
                   mss.resident >> 10,
                   (unsigned long)(mss.pss >> (10 + PSS_SHIFT)),
@@ -405,7 +407,9 @@ static int show_smap(struct seq_file *m, void *v)
                   mss.private_clean >> 10,
                   mss.private_dirty >> 10,
                   mss.referenced >> 10,
-                   mss.swap >> 10);
+                   mss.swap >> 10,
+                   vma_kernel_pagesize(vma) >> 10,
+                   vma_mmu_pagesize(vma) >> 10);
        if (m->count < m->size)  /* vma is copied successfully */
                m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 219bd79ea894..343ea1216bc8 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -9,31 +9,38 @@
 /*
 * Logic: we've got two memory sums for each process, "shared", and
- * "non-shared". Shared memory may get counted more then once, for
+ * "non-shared". Shared memory may get counted more than once, for
 * each process that owns it. Non-shared memory is counted
 * accurately.
 */
 void task_mem(struct seq_file *m, struct mm_struct *mm)
 {
-        struct vm_list_struct *vml;
+        struct vm_area_struct *vma;
-        unsigned long bytes = 0, sbytes = 0, slack = 0;
+        struct vm_region *region;
+        struct rb_node *p;
+        unsigned long bytes = 0, sbytes = 0, slack = 0, size;
        
        down_read(&mm->mmap_sem);
-        for (vml = mm->context.vmlist; vml; vml = vml->next) {
+        for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
-                if (!vml->vma)
+                vma = rb_entry(p, struct vm_area_struct, vm_rb);
-                        continue;
+                bytes += kobjsize(vma);
+                region = vma->vm_region;
+                if (region) {
+                        size = kobjsize(region);
+                        size += region->vm_end - region->vm_start;
+                } else {
+                        size = vma->vm_end - vma->vm_start;
+                }
-                bytes += kobjsize(vml);
                if (atomic_read(&mm->mm_count) > 1 ||
-                    atomic_read(&vml->vma->vm_usage) > 1
+                    vma->vm_flags & VM_MAYSHARE) {
-                    ) {
+                        sbytes += size;
-                        sbytes += kobjsize((void *) vml->vma->vm_start);
-                        sbytes += kobjsize(vml->vma);
                } else {
-                        bytes += kobjsize((void *) vml->vma->vm_start);
+                        bytes += size;
-                        bytes += kobjsize(vml->vma);
+                        if (region)
-                        slack += kobjsize((void *) vml->vma->vm_start) -
+                                slack = region->vm_end - vma->vm_end;
-                                (vml->vma->vm_end - vml->vma->vm_start);
                }
        }
@@ -70,13 +77,14 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 unsigned long task_vsize(struct mm_struct *mm)
 {
-        struct vm_list_struct *tbp;
+        struct vm_area_struct *vma;
+        struct rb_node *p;
        unsigned long vsize = 0;
        down_read(&mm->mmap_sem);
-        for (tbp = mm->context.vmlist; tbp; tbp = tbp->next) {
+        for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
-                if (tbp->vma)
+                vma = rb_entry(p, struct vm_area_struct, vm_rb);
-                        vsize += kobjsize((void *) tbp->vma->vm_start);
+                vsize += vma->vm_end - vma->vm_start;
        }
        up_read(&mm->mmap_sem);
        return vsize;
@@ -85,15 +93,19 @@ unsigned long task_vsize(struct mm_struct *mm)
 int task_statm(struct mm_struct *mm, int *shared, int *text,
               int *data, int *resident)
 {
-        struct vm_list_struct *tbp;
+        struct vm_area_struct *vma;
+        struct vm_region *region;
+        struct rb_node *p;
        int size = kobjsize(mm);
        down_read(&mm->mmap_sem);
-        for (tbp = mm->context.vmlist; tbp; tbp = tbp->next) {
+        for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
-                size += kobjsize(tbp);
+                vma = rb_entry(p, struct vm_area_struct, vm_rb);
-                if (tbp->vma) {
+                size += kobjsize(vma);
-                        size += kobjsize(tbp->vma);
+                region = vma->vm_region;
-                        size += kobjsize((void *) tbp->vma->vm_start);
+                if (region) {
+                        size += kobjsize(region);
+                        size += region->vm_end - region->vm_start;
                }
        }
@@ -105,20 +117,62 @@ int task_statm(struct mm_struct *mm, int *shared, int *text,
 }
 /*
+ * display a single VMA to a sequenced file
+ */
+static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
+{
+        unsigned long ino = 0;
+        struct file *file;
+        dev_t dev = 0;
+        int flags, len;
+        flags = vma->vm_flags;
+        file = vma->vm_file;
+        if (file) {
+                struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+                dev = inode->i_sb->s_dev;
+                ino = inode->i_ino;
+        }
+        seq_printf(m,
+                   "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n",
+                   vma->vm_start,
+                   vma->vm_end,
+                   flags & VM_READ ? 'r' : '-',
+                   flags & VM_WRITE ? 'w' : '-',
+                   flags & VM_EXEC ? 'x' : '-',
+                   flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p',
+                   vma->vm_pgoff << PAGE_SHIFT,
+                   MAJOR(dev), MINOR(dev), ino, &len);
+        if (file) {
+                len = 25 + sizeof(void *) * 6 - len;
+                if (len < 1)
+                        len = 1;
+                seq_printf(m, "%*c", len, ' ');
+                seq_path(m, &file->f_path, "");
+        }
+        seq_putc(m, '\n');
+        return 0;
+}
+/*
 * display mapping lines for a particular process's /proc/pid/maps
 */
-static int show_map(struct seq_file *m, void *_vml)
+static int show_map(struct seq_file *m, void *_p)
 {
-        struct vm_list_struct *vml = _vml;
+        struct rb_node *p = _p;
-        return nommu_vma_show(m, vml->vma);
+        return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb));
 }
 static void *m_start(struct seq_file *m, loff_t *pos)
 {
        struct proc_maps_private *priv = m->private;
-        struct vm_list_struct *vml;
        struct mm_struct *mm;
+        struct rb_node *p;
        loff_t n = *pos;
        /* pin the task and mm whilst we play with them */
@@ -134,9 +188,9 @@ static void *m_start(struct seq_file *m, loff_t *pos)
        }
        /* start from the Nth VMA */
-        for (vml = mm->context.vmlist; vml; vml = vml->next)
+        for (p = rb_first(&mm->mm_rb); p; p = rb_next(p))
                if (n-- == 0)
-                        return vml;
+                        return p;
        return NULL;
 }
@@ -152,12 +206,12 @@ static void m_stop(struct seq_file *m, void *_vml)
        }
 }
-static void *m_next(struct seq_file *m, void *_vml, loff_t *pos)
+static void *m_next(struct seq_file *m, void *_p, loff_t *pos)
 {
-        struct vm_list_struct *vml = _vml;
+        struct rb_node *p = _p;
        (*pos)++;
-        return vml ? vml->next : NULL;
+        return p ? rb_next(p) : NULL;
 }
 static const struct seq_operations proc_pid_maps_ops = {
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 03ec59504906..5edcc3f92ba7 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -47,8 +47,6 @@ static ssize_t read_from_oldmem(char *buf, size_t count,
        offset = (unsigned long)(*ppos % PAGE_SIZE);
        pfn = (unsigned long)(*ppos / PAGE_SIZE);
-        if (pfn > saved_max_pfn)
-                return -EINVAL;
        do {
                if (count > (PAGE_SIZE - offset))
diff --git a/fs/qnx4/Kconfig b/fs/qnx4/Kconfig
new file mode 100644
index 000000000000..be8e0e1445b6
--- /dev/null
+++ b/fs/qnx4/Kconfig
@@ -0,0 +1,25 @@
+config QNX4FS_FS
+        tristate "QNX4 file system support (read only)"
+        depends on BLOCK
+        help
+          This is the file system used by the real-time operating systems
+          QNX 4 and QNX 6 (the latter is also called QNX RTP).
+          Further information is available at <http://www.qnx.com/>.
+          Say Y if you intend to mount QNX hard disks or floppies.
+          Unless you say Y to "QNX4FS write support" below, you will
+          only be able to read these file systems.
+          To compile this file system support as a module, choose M here: the
+          module will be called qnx4.
+          If you don't know whether you need it, then you don't need it:
+          answer N.
+config QNX4FS_RW
+        bool "QNX4FS write support (DANGEROUS)"
+        depends on QNX4FS_FS && EXPERIMENTAL && BROKEN
+        help
+          Say Y if you want to test write support for QNX4 file systems.
+          It's currently broken, so for now:
+          answer N.
diff --git a/fs/quota.c b/fs/quota.c
index b7fe44e01618..d76ada914f98 100644
--- a/fs/quota.c
+++ b/fs/quota.c
@@ -73,7 +73,7 @@ static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, qid
                case Q_SETQUOTA:
                case Q_GETQUOTA:
                        /* This is just informative test so we are satisfied without a lock */
-                        if (!sb_has_quota_enabled(sb, type))
+                        if (!sb_has_quota_active(sb, type))
                                return -ESRCH;
        }
@@ -160,6 +160,9 @@ static void quota_sync_sb(struct super_block *sb, int type)
        int cnt;
        sb->s_qcop->quota_sync(sb, type);
+        if (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE)
+                return;
        /* This is not very clever (and fast) but currently I don't know about
         * any other simple way of getting quota data to disk and we must get
         * them there for userspace to be visible... */
@@ -175,7 +178,7 @@ static void quota_sync_sb(struct super_block *sb, int type)
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                if (type != -1 && cnt != type)
                        continue;
-                if (!sb_has_quota_enabled(sb, cnt))
+                if (!sb_has_quota_active(sb, cnt))
                        continue;
                mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex, I_MUTEX_QUOTA);
                truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0);
@@ -201,7 +204,7 @@ restart:
                for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                        if (type != -1 && type != cnt)
                                continue;
-                        if (!sb_has_quota_enabled(sb, cnt))
+                        if (!sb_has_quota_active(sb, cnt))
                                continue;
                        if (!info_dirty(&sb_dqopt(sb)->info[cnt]) &&
                            list_empty(&sb_dqopt(sb)->info[cnt].dqi_dirty_list))
@@ -245,7 +248,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, void
                        __u32 fmt;
                        down_read(&sb_dqopt(sb)->dqptr_sem);
-                        if (!sb_has_quota_enabled(sb, type)) {
+                        if (!sb_has_quota_active(sb, type)) {
                                up_read(&sb_dqopt(sb)->dqptr_sem);
                                return -ESRCH;
                        }
@@ -368,7 +371,8 @@ static inline struct super_block *quotactl_block(const char __user *special)
 * calls. Maybe we need to add the process quotas etc. in the future,
 * but we probably should use rlimits for that.
 */
-asmlinkage long sys_quotactl(unsigned int cmd, const char __user *special, qid_t id, void __user *addr)
+SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
+                qid_t, id, void __user *, addr)
 {
        uint cmds, type;
        struct super_block *sb = NULL;
diff --git a/fs/quota_tree.c b/fs/quota_tree.c
new file mode 100644
index 000000000000..953404c95b17
--- /dev/null
+++ b/fs/quota_tree.c
@@ -0,0 +1,645 @@
+/*
+ *      vfsv0 quota IO operations on file
+ */
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/dqblk_v2.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/quotaops.h>
+#include <asm/byteorder.h>
+#include "quota_tree.h"
+MODULE_AUTHOR("Jan Kara");
+MODULE_DESCRIPTION("Quota trie support");
+MODULE_LICENSE("GPL");
+#define __QUOTA_QT_PARANOIA
+typedef char *dqbuf_t;
+static int get_index(struct qtree_mem_dqinfo *info, qid_t id, int depth)
+{
+        unsigned int epb = info->dqi_usable_bs >> 2;
+        depth = info->dqi_qtree_depth - depth - 1;
+        while (depth--)
+                id /= epb;
+        return id % epb;
+}
+/* Number of entries in one block */
+static inline int qtree_dqstr_in_blk(struct qtree_mem_dqinfo *info)
+{
+        return (info->dqi_usable_bs - sizeof(struct qt_disk_dqdbheader))
+               / info->dqi_entry_size;
+}
+static dqbuf_t getdqbuf(size_t size)
+{
+        dqbuf_t buf = kmalloc(size, GFP_NOFS);
+        if (!buf)
+                printk(KERN_WARNING "VFS: Not enough memory for quota buffers.\n");
+        return buf;
+}
+static inline void freedqbuf(dqbuf_t buf)
+{
+        kfree(buf);
+}
+static inline ssize_t read_blk(struct qtree_mem_dqinfo *info, uint blk, dqbuf_t buf)
+{
+        struct super_block *sb = info->dqi_sb;
+        memset(buf, 0, info->dqi_usable_bs);
+        return sb->s_op->quota_read(sb, info->dqi_type, (char *)buf,
+               info->dqi_usable_bs, blk << info->dqi_blocksize_bits);
+}
+static inline ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, dqbuf_t buf)
+{
+        struct super_block *sb = info->dqi_sb;
+        return sb->s_op->quota_write(sb, info->dqi_type, (char *)buf,
+               info->dqi_usable_bs, blk << info->dqi_blocksize_bits);
+}
+/* Remove empty block from list and return it */
+static int get_free_dqblk(struct qtree_mem_dqinfo *info)
+{
+        dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+        struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
+        int ret, blk;
+        if (!buf)
+                return -ENOMEM;
+        if (info->dqi_free_blk) {
+                blk = info->dqi_free_blk;
+                ret = read_blk(info, blk, buf);
+                if (ret < 0)
+                        goto out_buf;
+                info->dqi_free_blk = le32_to_cpu(dh->dqdh_next_free);
+        }
+        else {
+                memset(buf, 0, info->dqi_usable_bs);
+                /* Assure block allocation... */
+                ret = write_blk(info, info->dqi_blocks, buf);
+                if (ret < 0)
+                        goto out_buf;
+                blk = info->dqi_blocks++;
+        }
+        mark_info_dirty(info->dqi_sb, info->dqi_type);
+        ret = blk;
+out_buf:
+        freedqbuf(buf);
+        return ret;
+}
+/* Insert empty block to the list */
+static int put_free_dqblk(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint blk)
+{
+        struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
+        int err;
+        dh->dqdh_next_free = cpu_to_le32(info->dqi_free_blk);
+        dh->dqdh_prev_free = cpu_to_le32(0);
+        dh->dqdh_entries = cpu_to_le16(0);
+        err = write_blk(info, blk, buf);
+        if (err < 0)
+                return err;
+        info->dqi_free_blk = blk;
+        mark_info_dirty(info->dqi_sb, info->dqi_type);
+        return 0;
+}
+/* Remove given block from the list of blocks with free entries */
+static int remove_free_dqentry(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint blk)
+{
+        dqbuf_t tmpbuf = getdqbuf(info->dqi_usable_bs);
+        struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
+        uint nextblk = le32_to_cpu(dh->dqdh_next_free);
+        uint prevblk = le32_to_cpu(dh->dqdh_prev_free);
+        int err;
+        if (!tmpbuf)
+                return -ENOMEM;
+        if (nextblk) {
+                err = read_blk(info, nextblk, tmpbuf);
+                if (err < 0)
+                        goto out_buf;
+                ((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_prev_free =
+                                                        dh->dqdh_prev_free;
+                err = write_blk(info, nextblk, tmpbuf);
+                if (err < 0)
+                        goto out_buf;
+        }
+        if (prevblk) {
+                err = read_blk(info, prevblk, tmpbuf);
+                if (err < 0)
+                        goto out_buf;
+                ((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_next_free =
+                                                        dh->dqdh_next_free;
+                err = write_blk(info, prevblk, tmpbuf);
+                if (err < 0)
+                        goto out_buf;
+        } else {
+                info->dqi_free_entry = nextblk;
+                mark_info_dirty(info->dqi_sb, info->dqi_type);
+        }
+        freedqbuf(tmpbuf);
+        dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0);
+        /* Whether or not the write succeeds, the block is out of the list */
+        if (write_blk(info, blk, buf) < 0)
+                printk(KERN_ERR "VFS: Can't write block (%u) with free entries.\n", blk);
+        return 0;
+out_buf:
+        freedqbuf(tmpbuf);
+        return err;
+}
+/* Insert given block to the beginning of list with free entries */
+static int insert_free_dqentry(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint blk)
+{
+        dqbuf_t tmpbuf = getdqbuf(info->dqi_usable_bs);
+        struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
+        int err;
+        if (!tmpbuf)
+                return -ENOMEM;
+        dh->dqdh_next_free = cpu_to_le32(info->dqi_free_entry);
+        dh->dqdh_prev_free = cpu_to_le32(0);
+        err = write_blk(info, blk, buf);
+        if (err < 0)
+                goto out_buf;
+        if (info->dqi_free_entry) {
+                err = read_blk(info, info->dqi_free_entry, tmpbuf);
+                if (err < 0)
+                        goto out_buf;
+                ((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_prev_free =
+                                                        cpu_to_le32(blk);
+                err = write_blk(info, info->dqi_free_entry, tmpbuf);
+                if (err < 0)
+                        goto out_buf;
+        }
+        freedqbuf(tmpbuf);
+        info->dqi_free_entry = blk;
+        mark_info_dirty(info->dqi_sb, info->dqi_type);
+        return 0;
+out_buf:
+        freedqbuf(tmpbuf);
+        return err;
+}
+/* Is the entry in the block free? */
+int qtree_entry_unused(struct qtree_mem_dqinfo *info, char *disk)
+{
+        int i;
+        for (i = 0; i < info->dqi_entry_size; i++)
+                if (disk[i])
+                        return 0;
+        return 1;
+}
+EXPORT_SYMBOL(qtree_entry_unused);
+/* Find space for dquot */
+static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
+                              struct dquot *dquot, int *err)
+{
+        uint blk, i;
+        struct qt_disk_dqdbheader *dh;
+        dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+        char *ddquot;
+        *err = 0;
+        if (!buf) {
+                *err = -ENOMEM;
+                return 0;
+        }
+        dh = (struct qt_disk_dqdbheader *)buf;
+        if (info->dqi_free_entry) {
+                blk = info->dqi_free_entry;
+                *err = read_blk(info, blk, buf);
+                if (*err < 0)
+                        goto out_buf;
+        } else {
+                blk = get_free_dqblk(info);
+                if ((int)blk < 0) {
+                        *err = blk;
+                        freedqbuf(buf);
+                        return 0;
+                }
+                memset(buf, 0, info->dqi_usable_bs);
+                /* This is enough as block is already zeroed and entry list is empty... */
+                info->dqi_free_entry = blk;
+                mark_info_dirty(dquot->dq_sb, dquot->dq_type);
+        }
+        /* Block will be full? */
+        if (le16_to_cpu(dh->dqdh_entries) + 1 >= qtree_dqstr_in_blk(info)) {
+                *err = remove_free_dqentry(info, buf, blk);
+                if (*err < 0) {
+                        printk(KERN_ERR "VFS: find_free_dqentry(): Can't "
+                               "remove block (%u) from entry free list.\n",
+                               blk);
+                        goto out_buf;
+                }
+        }
+        le16_add_cpu(&dh->dqdh_entries, 1);
+        /* Find free structure in block */
+        for (i = 0, ddquot = ((char *)buf) + sizeof(struct qt_disk_dqdbheader);
+             i < qtree_dqstr_in_blk(info) && !qtree_entry_unused(info, ddquot);
+             i++, ddquot += info->dqi_entry_size);
+#ifdef __QUOTA_QT_PARANOIA
+        if (i == qtree_dqstr_in_blk(info)) {
+                printk(KERN_ERR "VFS: find_free_dqentry(): Data block full "
+                                "but it shouldn't.\n");
+                *err = -EIO;
+                goto out_buf;
+        }
+#endif
+        *err = write_blk(info, blk, buf);
+        if (*err < 0) {
+                printk(KERN_ERR "VFS: find_free_dqentry(): Can't write quota "
+                                "data block %u.\n", blk);
+                goto out_buf;
+        }
+        dquot->dq_off = (blk << info->dqi_blocksize_bits) +
+                        sizeof(struct qt_disk_dqdbheader) +
+                        i * info->dqi_entry_size;
+        freedqbuf(buf);
+        return blk;
+out_buf:
+        freedqbuf(buf);
+        return 0;
+}
+/*
+ * Insert reference to structure into the trie
+ *
+ * Handles the tree block *treeblk at level depth. If *treeblk is 0, a block
+ * is taken via get_free_dqblk() and its number is stored back through
+ * treeblk; otherwise the existing block is read. The slot selected by
+ * get_index() for dquot->dq_id is then followed: on the last level
+ * (depth == info->dqi_qtree_depth - 1) find_free_dqentry() supplies the data
+ * block, on every other level the function recurses with depth+1.
+ *
+ * The block is written back only when the followed slot was empty (newson)
+ * and the lower level did not fail. If this call obtained the block itself
+ * (newact) and the lower level failed, the block is handed to
+ * put_free_dqblk().
+ *
+ * Returns -ENOMEM when no buffer could be obtained; otherwise the result of
+ * the last block operation or of the lower level (negative means error).
+ */
+static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
+                          uint *treeblk, int depth)
+{
+        dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+        int ret = 0, newson = 0, newact = 0;
+        __le32 *ref;
+        uint newblk;
+        if (!buf)
+                return -ENOMEM;
+        if (!*treeblk) {
+                ret = get_free_dqblk(info);
+                if (ret < 0)
+                        goto out_buf;
+                *treeblk = ret; /* report the block number to the caller */
+                memset(buf, 0, info->dqi_usable_bs);
+                newact = 1; /* this call obtained the block */
+        } else {
+                ret = read_blk(info, *treeblk, buf);
+                if (ret < 0) {
+                        printk(KERN_ERR "VFS: Can't read tree quota block "
+                                        "%u.\n", *treeblk);
+                        goto out_buf;
+                }
+        }
+        ref = (__le32 *)buf;
+        newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
+        if (!newblk)
+                newson = 1; /* slot is empty and has to be filled in */
+        if (depth == info->dqi_qtree_depth - 1) {
+#ifdef __QUOTA_QT_PARANOIA
+                if (newblk) {
+                        printk(KERN_ERR "VFS: Inserting already present quota "
+                                        "entry (block %u).\n",
+                               le32_to_cpu(ref[get_index(info,
+                                                dquot->dq_id, depth)]));
+                        ret = -EIO;
+                        goto out_buf;
+                }
+#endif
+                newblk = find_free_dqentry(info, dquot, &ret);
+        } else {
+                ret = do_insert_tree(info, dquot, &newblk, depth+1);
+        }
+        if (newson && ret >= 0) {
+                ref[get_index(info, dquot->dq_id, depth)] =
+                                                        cpu_to_le32(newblk);
+                ret = write_blk(info, *treeblk, buf);
+        } else if (newact && ret < 0) {
+                put_free_dqblk(info, buf, *treeblk); /* give the unused block back */
+        }
+out_buf:
+        freedqbuf(buf);
+        return ret;
+}
+/*
+ * Wrapper for inserting quota structure into tree
+ *
+ * Starts the insertion at the root tree block QT_TREEOFF on level 0 and
+ * returns whatever do_insert_tree() returns. The block number lives in a
+ * uint because do_insert_tree() takes (and may store through) a uint
+ * pointer; qtree_delete_dquot() uses the same type for its root block.
+ */
+static inline int dq_insert_tree(struct qtree_mem_dqinfo *info,
+                                 struct dquot *dquot)
+{
+        uint tmp = QT_TREEOFF;
+        return do_insert_tree(info, dquot, &tmp, 0);
+}
+/*
+ *      Store the in-memory dquot into its entry in the quota file; a dquot
+ *      that has no file offset yet is inserted into the tree first.
+ *
+ *      We don't have to be afraid of deadlocks as we never have quotas on
+ *      quota files...
+ *
+ *      Returns 0 when the whole entry was written, -ENOMEM when no buffer
+ *      could be obtained, the error of the tree insertion, the negative
+ *      result of the write, or -ENOSPC when the write was short.
+ */
+int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
+{
+        struct super_block *sb = dquot->dq_sb;
+        int type = dquot->dq_type;
+        dqbuf_t disk_dq = getdqbuf(info->dqi_entry_size);
+        ssize_t ret;
+        if (!disk_dq)
+                return -ENOMEM;
+        /* dq_off is guarded by dqio_mutex */
+        if (!dquot->dq_off) {
+                ret = dq_insert_tree(info, dquot);
+                if (ret < 0) {
+                        printk(KERN_ERR "VFS: Error %zd occurred while "
+                                        "creating quota.\n", ret);
+                        goto out_free;
+                }
+        }
+        /* Convert to the on-disk form under dq_data_lock */
+        spin_lock(&dq_data_lock);
+        info->dqi_ops->mem2disk_dqblk(disk_dq, dquot);
+        spin_unlock(&dq_data_lock);
+        ret = sb->s_op->quota_write(sb, type, (char *)disk_dq,
+                                    info->dqi_entry_size, dquot->dq_off);
+        if (ret == info->dqi_entry_size) {
+                ret = 0;
+        } else {
+                printk(KERN_WARNING "VFS: dquota write failed on dev %s\n",
+                       sb->s_id);
+                /* A short write is reported as lack of space */
+                if (ret >= 0)
+                        ret = -ENOSPC;
+        }
+        dqstats.writes++;
+out_free:
+        freedqbuf(disk_dq);
+        return ret;
+}
+EXPORT_SYMBOL(qtree_write_dquot);
+/*
+ * Free dquot entry in data block
+ *
+ * blk is the data block the tree references for this dquot; it has to match
+ * the block derived from dquot->dq_off. A mismatch is reported as -EIO so
+ * that the caller does not go on and drop the tree reference to an entry
+ * that was never released.
+ *
+ * The entry count in the block header is decremented. If the block became
+ * empty it is taken off the free-entry list and handed to put_free_dqblk().
+ * Otherwise the entry is zeroed and the block is either inserted into the
+ * free-entry list (when it held one entry less than qtree_dqstr_in_blk()
+ * after the decrement) or simply written back.
+ *
+ * On success dquot->dq_off is reset to 0. Returns a negative value on error
+ * (-ENOMEM when no buffer could be obtained, -EIO on the mismatch above,
+ * otherwise the result of the failed block operation).
+ */
+static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
+                        uint blk)
+{
+        struct qt_disk_dqdbheader *dh;
+        dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+        int ret = 0;
+        if (!buf)
+                return -ENOMEM;
+        if (dquot->dq_off >> info->dqi_blocksize_bits != blk) {
+                printk(KERN_ERR "VFS: Quota structure has offset to other "
+                  "block (%u) than it should (%u).\n", blk,
+                  (uint)(dquot->dq_off >> info->dqi_blocksize_bits));
+                /* Inconsistent tree: fail instead of pretending success */
+                ret = -EIO;
+                goto out_buf;
+        }
+        ret = read_blk(info, blk, buf);
+        if (ret < 0) {
+                printk(KERN_ERR "VFS: Can't read quota data block %u\n", blk);
+                goto out_buf;
+        }
+        dh = (struct qt_disk_dqdbheader *)buf;
+        le16_add_cpu(&dh->dqdh_entries, -1);
+        if (!le16_to_cpu(dh->dqdh_entries)) {   /* Block got free? */
+                ret = remove_free_dqentry(info, buf, blk);
+                if (ret >= 0)
+                        ret = put_free_dqblk(info, buf, blk);
+                if (ret < 0) {
+                        printk(KERN_ERR "VFS: Can't move quota data block (%u) "
+                          "to free list.\n", blk);
+                        goto out_buf;
+                }
+        } else {
+                /* Zero the entry at its offset inside the block */
+                memset(buf +
+                       (dquot->dq_off & ((1 << info->dqi_blocksize_bits) - 1)),
+                       0, info->dqi_entry_size);
+                if (le16_to_cpu(dh->dqdh_entries) ==
+                    qtree_dqstr_in_blk(info) - 1) {
+                        /* Insert will write block itself */
+                        ret = insert_free_dqentry(info, buf, blk);
+                        if (ret < 0) {
+                                printk(KERN_ERR "VFS: Can't insert quota data "
+                                       "block (%u) to free entry list.\n", blk);
+                                goto out_buf;
+                        }
+                } else {
+                        ret = write_blk(info, blk, buf);
+                        if (ret < 0) {
+                                printk(KERN_ERR "VFS: Can't write quota data "
+                                  "block %u\n", blk);
+                                goto out_buf;
+                        }
+                }
+        }
+        dquot->dq_off = 0;      /* Quota is now unattached */
+out_buf:
+        freedqbuf(buf);
+        return ret;
+}
+/*
+ * Remove reference to dquot from tree
+ *
+ * Reads tree block *blk of level depth and follows the slot selected by
+ * get_index() for dquot->dq_id. On the last level
+ * (depth == info->dqi_qtree_depth - 1) the entry is released through
+ * free_dqentry(); on every other level the function recurses with depth+1.
+ * A lower level that gave its block away reports this by zeroing the block
+ * number it was passed.
+ *
+ * When the lower level did not fail and left no block behind, the slot is
+ * cleared. If that leaves all (info->dqi_usable_bs >> 2) references in the
+ * block zero and the block is not the root (QT_TREEOFF), the block is handed
+ * to put_free_dqblk() and *blk is set to 0; otherwise the block is written
+ * back.
+ *
+ * Returns -ENOMEM when no buffer could be obtained; otherwise the result of
+ * the last block operation or of the lower level (negative means error).
+ */
+static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
+                       uint *blk, int depth)
+{
+        dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+        int ret = 0;
+        uint newblk;
+        __le32 *ref = (__le32 *)buf;
+        if (!buf)
+                return -ENOMEM;
+        ret = read_blk(info, *blk, buf);
+        if (ret < 0) {
+                printk(KERN_ERR "VFS: Can't read quota data block %u\n", *blk);
+                goto out_buf;
+        }
+        newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
+        if (depth == info->dqi_qtree_depth - 1) {
+                ret = free_dqentry(info, dquot, newblk);
+                newblk = 0; /* the slot is cleared below unless ret < 0 */
+        } else {
+                ret = remove_tree(info, dquot, &newblk, depth+1);
+        }
+        if (ret >= 0 && !newblk) {
+                int i;
+                ref[get_index(info, dquot->dq_id, depth)] = cpu_to_le32(0);
+                /* Block got empty? */
+                for (i = 0;
+                     i < (info->dqi_usable_bs >> 2) && !ref[i];
+                     i++);
+                /* Don't put the root block into the free block list */
+                if (i == (info->dqi_usable_bs >> 2)
+                    && *blk != QT_TREEOFF) {
+                        put_free_dqblk(info, buf, *blk);
+                        *blk = 0; /* tell the caller this block is gone */
+                } else {
+                        ret = write_blk(info, *blk, buf);
+                        if (ret < 0)
+                                printk(KERN_ERR "VFS: Can't write quota tree "
+                                  "block %u.\n", *blk);
+                }
+        }
+out_buf:
+        freedqbuf(buf);
+        return ret;
+}
+/*
+ * Delete dquot from tree
+ *
+ * A dquot without a file offset was never stored, so there is nothing to
+ * remove and 0 is returned. Otherwise the removal starts at the root tree
+ * block QT_TREEOFF and the result of remove_tree() is returned.
+ */
+int qtree_delete_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
+{
+        uint root = QT_TREEOFF;
+        if (dquot->dq_off)
+                return remove_tree(info, dquot, &root, 0);
+        return 0;       /* Even not allocated */
+}
+EXPORT_SYMBOL(qtree_delete_dquot);
+/*
+ * Find entry in block
+ *
+ * Reads data block blk and scans its entries, which start right after the
+ * struct qt_disk_dqdbheader, for the one the format's is_id() callback
+ * attributes to dquot.
+ *
+ * Returns the byte offset of that entry in the quota file, -ENOMEM when no
+ * buffer could be obtained, the negative read_blk() result when the block
+ * cannot be read, or -EIO when no entry in the block matches.
+ */
+static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
+                                 struct dquot *dquot, uint blk)
+{
+        dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+        loff_t ret = 0;
+        int i;
+        char *ddquot;
+        if (!buf)
+                return -ENOMEM;
+        ret = read_blk(info, blk, buf);
+        if (ret < 0) {
+                printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
+                goto out_buf;
+        }
+        for (i = 0, ddquot = ((char *)buf) + sizeof(struct qt_disk_dqdbheader);
+             i < qtree_dqstr_in_blk(info) && !info->dqi_ops->is_id(ddquot, dquot);
+             i++, ddquot += info->dqi_entry_size);
+        if (i == qtree_dqstr_in_blk(info)) {
+                printk(KERN_ERR "VFS: Quota for id %u referenced "
+                  "but not present.\n", dquot->dq_id);
+                ret = -EIO;
+                goto out_buf;
+        }
+        /*
+         * Widen blk before shifting: done in uint, the shift would drop the
+         * high bits of the offset before it ever reaches the loff_t result.
+         */
+        ret = ((loff_t)blk << info->dqi_blocksize_bits) +
+              sizeof(struct qt_disk_dqdbheader) + i * info->dqi_entry_size;
+out_buf:
+        freedqbuf(buf);
+        return ret;
+}
+/*
+ * Find entry for given id in the tree
+ *
+ * Reads tree block blk of level depth and follows the reference selected by
+ * get_index() for dquot->dq_id: into the next tree level, or on the last
+ * level into the data block via find_block_dqentry().
+ *
+ * Returns the result of the lower level, 0 when the reference is empty,
+ * -ENOMEM when no buffer could be obtained, or the negative read_blk()
+ * result when the block cannot be read.
+ */
+static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info,
+                                struct dquot *dquot, uint blk, int depth)
+{
+        dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+        __le32 *slots = (__le32 *)buf;
+        loff_t off;
+        uint child;
+        if (!buf)
+                return -ENOMEM;
+        off = read_blk(info, blk, buf);
+        if (off < 0) {
+                printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
+                goto out_buf;
+        }
+        off = 0;
+        child = le32_to_cpu(slots[get_index(info, dquot->dq_id, depth)]);
+        if (child) {    /* Only descend when there is a reference */
+                if (depth >= info->dqi_qtree_depth - 1)
+                        off = find_block_dqentry(info, dquot, child);
+                else
+                        off = find_tree_dqentry(info, dquot, child, depth+1);
+        }
+out_buf:
+        freedqbuf(buf);
+        return off;
+}
+/*
+ * Find entry for given id in the tree - wrapper function
+ *
+ * Starts the lookup at the root tree block QT_TREEOFF on level 0 and
+ * returns the result of find_tree_dqentry().
+ */
+static inline loff_t find_dqentry(struct qtree_mem_dqinfo *info,
+                                  struct dquot *dquot)
+{
+        const uint root = QT_TREEOFF;
+        return find_tree_dqentry(info, dquot, root, 0);
+}
+/* Flag the dquot as fake (DQ_FAKE_B) and zero its in-memory quota data */
+static void qtree_fake_dquot(struct dquot *dquot)
+{
+        set_bit(DQ_FAKE_B, &dquot->dq_flags);
+        memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
+}
+/*
+ * Read the on-disk entry of a dquot into memory
+ *
+ * If the file offset of the entry is not known yet it is looked up in the
+ * tree. A dquot that is not present, cannot be looked up or cannot be read
+ * is turned into a zeroed fake one. A successfully read dquot is flagged
+ * fake as well when all four of its limits are zero.
+ *
+ * Returns 0 when the entry is not present, the (positive) quota_read()
+ * result when the entry was read in full, -EIO on a short read, -ENOMEM
+ * when no buffer could be obtained, or the negative lookup / read result.
+ */
+int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
+{
+        struct super_block *sb = dquot->dq_sb;
+        int type = dquot->dq_type;
+        struct mem_dqblk *dqb = &dquot->dq_dqb;
+        dqbuf_t disk_dq;
+        loff_t offset;
+        int ret = 0;
+#ifdef __QUOTA_QT_PARANOIA
+        /* Invalidated quota? */
+        if (!sb_dqopt(dquot->dq_sb)->files[type]) {
+                printk(KERN_ERR "VFS: Quota invalidated while reading!\n");
+                return -EIO;
+        }
+#endif
+        /* Do we know offset of the dquot entry in the quota file? */
+        if (!dquot->dq_off) {
+                offset = find_dqentry(info, dquot);
+                if (offset > 0) {
+                        dquot->dq_off = offset;
+                } else {        /* Entry not present, or lookup failed */
+                        if (offset < 0)
+                                printk(KERN_ERR "VFS: Can't read quota "
+                                  "structure for id %u.\n", dquot->dq_id);
+                        dquot->dq_off = 0;
+                        qtree_fake_dquot(dquot);
+                        ret = offset;
+                        goto out;
+                }
+        }
+        disk_dq = getdqbuf(info->dqi_entry_size);
+        if (!disk_dq)
+                return -ENOMEM;
+        ret = sb->s_op->quota_read(sb, type, (char *)disk_dq,
+                                   info->dqi_entry_size, dquot->dq_off);
+        if (ret != info->dqi_entry_size) {
+                /* A short read is reported as an I/O error */
+                if (ret >= 0)
+                        ret = -EIO;
+                printk(KERN_ERR "VFS: Error while reading quota "
+                                "structure for id %u.\n", dquot->dq_id);
+                qtree_fake_dquot(dquot);
+                goto out_free;
+        }
+        spin_lock(&dq_data_lock);
+        info->dqi_ops->disk2mem_dqblk(dquot, disk_dq);
+        /* No limit set at all? Then the entry counts as fake */
+        if (!(dqb->dqb_bhardlimit || dqb->dqb_bsoftlimit ||
+              dqb->dqb_ihardlimit || dqb->dqb_isoftlimit))
+                set_bit(DQ_FAKE_B, &dquot->dq_flags);
+        spin_unlock(&dq_data_lock);
+out_free:
+        freedqbuf(disk_dq);
+out:
+        dqstats.reads++;
+        return ret;
+}
+EXPORT_SYMBOL(qtree_read_dquot);
+/*
+ * Check whether dquot should not be deleted. We know we are
+ * the only one operating on dquot (thanks to dq_lock)
+ *
+ * Only a fake dquot (DQ_FAKE_B set) with neither inodes nor space in use
+ * is deleted from the tree; in every other case 0 is returned and the
+ * dquot is left alone.
+ */
+int qtree_release_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
+{
+        if (!test_bit(DQ_FAKE_B, &dquot->dq_flags))
+                return 0;
+        /* Still accounting something? Then keep the entry */
+        if (dquot->dq_dqb.dqb_curinodes | dquot->dq_dqb.dqb_curspace)
+                return 0;
+        return qtree_delete_dquot(info, dquot);
+}
+EXPORT_SYMBOL(qtree_release_dquot);
diff --git a/fs/quota_tree.h b/fs/quota_tree.h
new file mode 100644
index 000000000000..a1ab8db81a51
--- /dev/null
+++ b/fs/quota_tree.h
@@ -0,0 +1,25 @@
+/*
+ *      Definitions of structures for vfsv0 quota format
+ */
+#ifndef _LINUX_QUOTA_TREE_H
+#define _LINUX_QUOTA_TREE_H
+#include <linux/types.h>
+#include <linux/quota.h>
+/*
+ *  Structure of header of block with quota structures. It is padded to 16 bytes so
+ *  there will be space for exactly 21 quota-entries in a block
+ *
+ *  All fields are little-endian on disk. The tree code places the first
+ *  entry of a data block directly behind this header.
+ *
+ *  NOTE(review): the "21 quota-entries" figure presumably stems from the
+ *  vfsv0 layout; the tree code asks qtree_dqstr_in_blk() for the per-block
+ *  entry count instead of hard-coding it - confirm before relying on 21.
+ */
+struct qt_disk_dqdbheader {
+        __le32 dqdh_next_free;  /* Number of next block with free entry */
+        __le32 dqdh_prev_free;  /* Number of previous block with free entry */
+        __le16 dqdh_entries;    /* Number of valid entries in block */
+        __le16 dqdh_pad1;       /* Padding (header is padded to 16 bytes) */
+        __le32 dqdh_pad2;       /* Padding (header is padded to 16 bytes) */
+};
+#define QT_TREEOFF      1               /* Offset of tree in file in blocks */
+#endif /* _LINUX_QUOTA_TREE_H */
diff --git a/fs/quota_v1.c b/fs/quota_v1.c
index 5ae15b13eeb0..b4af1c69ad16 100644
--- a/fs/quota_v1.c
+++ b/fs/quota_v1.c
@@ -3,25 +3,39 @@
 #include <linux/quota.h>
 #include <linux/quotaops.h>
 #include <linux/dqblk_v1.h>
-#include <linux/quotaio_v1.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/module.h>
 #include <asm/byteorder.h>
+#include "quotaio_v1.h"
 MODULE_AUTHOR("Jan Kara");
 MODULE_DESCRIPTION("Old quota format support");
 MODULE_LICENSE("GPL");
+#define QUOTABLOCK_BITS 10
+#define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS)
+/* Convert a space value to quota blocks of QUOTABLOCK_SIZE, rounding up */
+static inline qsize_t v1_stoqb(qsize_t space)
+{
+        qsize_t rounded = space + QUOTABLOCK_SIZE - 1;
+        return rounded >> QUOTABLOCK_BITS;
+}
+/* Convert a count of quota blocks of QUOTABLOCK_SIZE to a space value */
+static inline qsize_t v1_qbtos(qsize_t blocks)
+{
+        const qsize_t space = blocks << QUOTABLOCK_BITS;
+        return space;
+}
 static void v1_disk2mem_dqblk(struct mem_dqblk *m, struct v1_disk_dqblk *d)
 {
        m->dqb_ihardlimit = d->dqb_ihardlimit;
        m->dqb_isoftlimit = d->dqb_isoftlimit;
        m->dqb_curinodes = d->dqb_curinodes;
-        m->dqb_bhardlimit = d->dqb_bhardlimit;
+        m->dqb_bhardlimit = v1_qbtos(d->dqb_bhardlimit);
-        m->dqb_bsoftlimit = d->dqb_bsoftlimit;
+        m->dqb_bsoftlimit = v1_qbtos(d->dqb_bsoftlimit);
-        m->dqb_curspace = ((qsize_t)d->dqb_curblocks) << QUOTABLOCK_BITS;
+        m->dqb_curspace = v1_qbtos(d->dqb_curblocks);
        m->dqb_itime = d->dqb_itime;
        m->dqb_btime = d->dqb_btime;
 }
@@ -31,9 +45,9 @@ static void v1_mem2disk_dqblk(struct v1_disk_dqblk *d, struct mem_dqblk *m)
        d->dqb_ihardlimit = m->dqb_ihardlimit;
        d->dqb_isoftlimit = m->dqb_isoftlimit;
        d->dqb_curinodes = m->dqb_curinodes;
-        d->dqb_bhardlimit = m->dqb_bhardlimit;
+        d->dqb_bhardlimit = v1_stoqb(m->dqb_bhardlimit);
-        d->dqb_bsoftlimit = m->dqb_bsoftlimit;
+        d->dqb_bsoftlimit = v1_stoqb(m->dqb_bsoftlimit);
-        d->dqb_curblocks = toqb(m->dqb_curspace);
+        d->dqb_curblocks = v1_stoqb(m->dqb_curspace);
        d->dqb_itime = m->dqb_itime;
        d->dqb_btime = m->dqb_btime;
 }
diff --git a/fs/quota_v2.c b/fs/quota_v2.c
index b53827dc02d9..b618b563635c 100644
--- a/fs/quota_v2.c
+++ b/fs/quota_v2.c
@@ -6,7 +6,6 @@
 #include <linux/fs.h>
 #include <linux/mount.h>
 #include <linux/dqblk_v2.h>
-#include <linux/quotaio_v2.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/module.h>
@@ -15,16 +14,37 @@
 #include <asm/byteorder.h>
+#include "quota_tree.h"
+#include "quotaio_v2.h"
 MODULE_AUTHOR("Jan Kara");
 MODULE_DESCRIPTION("Quota format v2 support");
 MODULE_LICENSE("GPL");
 #define __QUOTA_V2_PARANOIA
-typedef char *dqbuf_t;
+static void v2_mem2diskdqb(void *dp, struct dquot *dquot);
+static void v2_disk2memdqb(struct dquot *dquot, void *dp);
+static int v2_is_id(void *dp, struct dquot *dquot);
+static struct qtree_fmt_operations v2_qtree_ops = {
+        .mem2disk_dqblk = v2_mem2diskdqb,
+        .disk2mem_dqblk = v2_disk2memdqb,
+        .is_id = v2_is_id,
+};
+#define QUOTABLOCK_BITS 10
+#define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS)
-#define GETIDINDEX(id, depth) (((id) >> ((V2_DQTREEDEPTH-(depth)-1)*8)) & 0xff)
+static inline qsize_t v2_stoqb(qsize_t space)
-#define GETENTRIES(buf) ((struct v2_disk_dqblk *)(((char *)buf)+sizeof(struct v2_disk_dqdbheader)))
+{
+        return (space + QUOTABLOCK_SIZE - 1) >> QUOTABLOCK_BITS;
+}
+static inline qsize_t v2_qbtos(qsize_t blocks)
+{
+        return blocks << QUOTABLOCK_BITS;
+}
 /* Check whether given file is really vfsv0 quotafile */
 static int v2_check_quota_file(struct super_block *sb, int type)
@@ -50,7 +70,8 @@ static int v2_check_quota_file(struct super_block *sb, int type)
 static int v2_read_file_info(struct super_block *sb, int type)
 {
        struct v2_disk_dqinfo dinfo;
-        struct mem_dqinfo *info = sb_dqopt(sb)->info+type;
+        struct mem_dqinfo *info = sb_dqinfo(sb, type);
+        struct qtree_mem_dqinfo *qinfo;
        ssize_t size;
        size = sb->s_op->quota_read(sb, type, (char *)&dinfo,
@@ -60,15 +81,29 @@ static int v2_read_file_info(struct super_block *sb, int type)
                        sb->s_id);
                return -1;
        }
+        info->dqi_priv = kmalloc(sizeof(struct qtree_mem_dqinfo), GFP_NOFS);
+        if (!info->dqi_priv) {
+                printk(KERN_WARNING
+                       "Not enough memory for quota information structure.\n");
+                return -1;
+        }
+        qinfo = info->dqi_priv;
        /* limits are stored as unsigned 32-bit data */
        info->dqi_maxblimit = 0xffffffff;
        info->dqi_maxilimit = 0xffffffff;
        info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace);
        info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace);
        info->dqi_flags = le32_to_cpu(dinfo.dqi_flags);
-        info->u.v2_i.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
+        qinfo->dqi_sb = sb;
-        info->u.v2_i.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk);
+        qinfo->dqi_type = type;
-        info->u.v2_i.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry);
+        qinfo->dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
+        qinfo->dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk);
+        qinfo->dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry);
+        qinfo->dqi_blocksize_bits = V2_DQBLKSIZE_BITS;
+        qinfo->dqi_usable_bs = 1 << V2_DQBLKSIZE_BITS;
+        qinfo->dqi_qtree_depth = qtree_depth(qinfo);
+        qinfo->dqi_entry_size = sizeof(struct v2_disk_dqblk);
+        qinfo->dqi_ops = &v2_qtree_ops;
        return 0;
 }
@@ -76,7 +111,8 @@ static int v2_read_file_info(struct super_block *sb, int type)
 static int v2_write_file_info(struct super_block *sb, int type)
 {
        struct v2_disk_dqinfo dinfo;
-        struct mem_dqinfo *info = sb_dqopt(sb)->info+type;
+        struct mem_dqinfo *info = sb_dqinfo(sb, type);
+        struct qtree_mem_dqinfo *qinfo = info->dqi_priv;
        ssize_t size;
        spin_lock(&dq_data_lock);
@@ -85,9 +121,9 @@ static int v2_write_file_info(struct super_block *sb, int type)
        dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace);
        dinfo.dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK);
        spin_unlock(&dq_data_lock);
-        dinfo.dqi_blocks = cpu_to_le32(info->u.v2_i.dqi_blocks);
+        dinfo.dqi_blocks = cpu_to_le32(qinfo->dqi_blocks);
-        dinfo.dqi_free_blk = cpu_to_le32(info->u.v2_i.dqi_free_blk);
+        dinfo.dqi_free_blk = cpu_to_le32(qinfo->dqi_free_blk);
-        dinfo.dqi_free_entry = cpu_to_le32(info->u.v2_i.dqi_free_entry);
+        dinfo.dqi_free_entry = cpu_to_le32(qinfo->dqi_free_entry);
        size = sb->s_op->quota_write(sb, type, (char *)&dinfo,
               sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
        if (size != sizeof(struct v2_disk_dqinfo)) {
@@ -98,574 +134,75 @@ static int v2_write_file_info(struct super_block *sb, int type)
        return 0;
 }
-static void disk2memdqb(struct mem_dqblk *m, struct v2_disk_dqblk *d)
+static void v2_disk2memdqb(struct dquot *dquot, void *dp)
 {
+        struct v2_disk_dqblk *d = dp, empty;
+        struct mem_dqblk *m = &dquot->dq_dqb;
        m->dqb_ihardlimit = le32_to_cpu(d->dqb_ihardlimit);
        m->dqb_isoftlimit = le32_to_cpu(d->dqb_isoftlimit);
        m->dqb_curinodes = le32_to_cpu(d->dqb_curinodes);
        m->dqb_itime = le64_to_cpu(d->dqb_itime);
-        m->dqb_bhardlimit = le32_to_cpu(d->dqb_bhardlimit);
+        m->dqb_bhardlimit = v2_qbtos(le32_to_cpu(d->dqb_bhardlimit));
-        m->dqb_bsoftlimit = le32_to_cpu(d->dqb_bsoftlimit);
+        m->dqb_bsoftlimit = v2_qbtos(le32_to_cpu(d->dqb_bsoftlimit));
        m->dqb_curspace = le64_to_cpu(d->dqb_curspace);
        m->dqb_btime = le64_to_cpu(d->dqb_btime);
+        /* We need to escape back all-zero structure */
+        memset(&empty, 0, sizeof(struct v2_disk_dqblk));
+        empty.dqb_itime = cpu_to_le64(1);
+        if (!memcmp(&empty, dp, sizeof(struct v2_disk_dqblk)))
+                m->dqb_itime = 0;
 }
-static void mem2diskdqb(struct v2_disk_dqblk *d, struct mem_dqblk *m, qid_t id)
+static void v2_mem2diskdqb(void *dp, struct dquot *dquot)
 {
+        struct v2_disk_dqblk *d = dp;
+        struct mem_dqblk *m = &dquot->dq_dqb;
+        struct qtree_mem_dqinfo *info =
+                        sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
        d->dqb_ihardlimit = cpu_to_le32(m->dqb_ihardlimit);
        d->dqb_isoftlimit = cpu_to_le32(m->dqb_isoftlimit);
        d->dqb_curinodes = cpu_to_le32(m->dqb_curinodes);
        d->dqb_itime = cpu_to_le64(m->dqb_itime);
-        d->dqb_bhardlimit = cpu_to_le32(m->dqb_bhardlimit);
+        d->dqb_bhardlimit = cpu_to_le32(v2_stoqb(m->dqb_bhardlimit));
-        d->dqb_bsoftlimit = cpu_to_le32(m->dqb_bsoftlimit);
+        d->dqb_bsoftlimit = cpu_to_le32(v2_stoqb(m->dqb_bsoftlimit));
        d->dqb_curspace = cpu_to_le64(m->dqb_curspace);
        d->dqb_btime = cpu_to_le64(m->dqb_btime);
-        d->dqb_id = cpu_to_le32(id);
+        d->dqb_id = cpu_to_le32(dquot->dq_id);
-}
+        if (qtree_entry_unused(info, dp))
+                d->dqb_itime = cpu_to_le64(1);
-static dqbuf_t getdqbuf(void)
-{
-        dqbuf_t buf = kmalloc(V2_DQBLKSIZE, GFP_NOFS);
-        if (!buf)
-                printk(KERN_WARNING "VFS: Not enough memory for quota buffers.\n");
-        return buf;
-}
-static inline void freedqbuf(dqbuf_t buf)
-{
-        kfree(buf);
-}
-static inline ssize_t read_blk(struct super_block *sb, int type, uint blk, dqbuf_t buf)
-{
-        memset(buf, 0, V2_DQBLKSIZE);
-        return sb->s_op->quota_read(sb, type, (char *)buf,
-               V2_DQBLKSIZE, blk << V2_DQBLKSIZE_BITS);
-}
-static inline ssize_t write_blk(struct super_block *sb, int type, uint blk, dqbuf_t buf)
-{
-        return sb->s_op->quota_write(sb, type, (char *)buf,
-               V2_DQBLKSIZE, blk << V2_DQBLKSIZE_BITS);
-}
-/* Remove empty block from list and return it */
-static int get_free_dqblk(struct super_block *sb, int type)
-{
-        dqbuf_t buf = getdqbuf();
-        struct mem_dqinfo *info = sb_dqinfo(sb, type);
-        struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf;
-        int ret, blk;
-        if (!buf)
-                return -ENOMEM;
-        if (info->u.v2_i.dqi_free_blk) {
-                blk = info->u.v2_i.dqi_free_blk;
-                if ((ret = read_blk(sb, type, blk, buf)) < 0)
-                        goto out_buf;
-                info->u.v2_i.dqi_free_blk = le32_to_cpu(dh->dqdh_next_free);
-        }
-        else {
-                memset(buf, 0, V2_DQBLKSIZE);
-                /* Assure block allocation... */
-                if ((ret = write_blk(sb, type, info->u.v2_i.dqi_blocks, buf)) < 0)
-                        goto out_buf;
-                blk = info->u.v2_i.dqi_blocks++;
-        }
-        mark_info_dirty(sb, type);
-        ret = blk;
-out_buf:
-        freedqbuf(buf);
-        return ret;
-}
-/* Insert empty block to the list */
-static int put_free_dqblk(struct super_block *sb, int type, dqbuf_t buf, uint blk)
-{
-        struct mem_dqinfo *info = sb_dqinfo(sb, type);
-        struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf;
-        int err;
-        dh->dqdh_next_free = cpu_to_le32(info->u.v2_i.dqi_free_blk);
-        dh->dqdh_prev_free = cpu_to_le32(0);
-        dh->dqdh_entries = cpu_to_le16(0);
-        info->u.v2_i.dqi_free_blk = blk;
-        mark_info_dirty(sb, type);
-        /* Some strange block. We had better leave it... */
-        if ((err = write_blk(sb, type, blk, buf)) < 0)
-                return err;
-        return 0;
 }
-/* Remove given block from the list of blocks with free entries */
+static int v2_is_id(void *dp, struct dquot *dquot)
-static int remove_free_dqentry(struct super_block *sb, int type, dqbuf_t buf, uint blk)
 {
-        dqbuf_t tmpbuf = getdqbuf();
+        struct v2_disk_dqblk *d = dp;
-        struct mem_dqinfo *info = sb_dqinfo(sb, type);
+        struct qtree_mem_dqinfo *info =
-        struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf;
+                        sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
-        uint nextblk = le32_to_cpu(dh->dqdh_next_free), prevblk = le32_to_cpu(dh->dqdh_prev_free);
-        int err;
-        if (!tmpbuf)
+        if (qtree_entry_unused(info, dp))
-                return -ENOMEM;
-        if (nextblk) {
-                if ((err = read_blk(sb, type, nextblk, tmpbuf)) < 0)
-                        goto out_buf;
-                ((struct v2_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = dh->dqdh_prev_free;
-                if ((err = write_blk(sb, type, nextblk, tmpbuf)) < 0)
-                        goto out_buf;
-        }
-        if (prevblk) {
-                if ((err = read_blk(sb, type, prevblk, tmpbuf)) < 0)
-                        goto out_buf;
-                ((struct v2_disk_dqdbheader *)tmpbuf)->dqdh_next_free = dh->dqdh_next_free;
-                if ((err = write_blk(sb, type, prevblk, tmpbuf)) < 0)
-                        goto out_buf;
-        }
-        else {
-                info->u.v2_i.dqi_free_entry = nextblk;
-                mark_info_dirty(sb, type);
-        }
-        freedqbuf(tmpbuf);
-        dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0);
-        /* No matter whether write succeeds block is out of list */
-        if (write_blk(sb, type, blk, buf) < 0)
-                printk(KERN_ERR "VFS: Can't write block (%u) with free entries.\n", blk);
-        return 0;
-out_buf:
-        freedqbuf(tmpbuf);
-        return err;
-}
-/* Insert given block to the beginning of list with free entries */
-static int insert_free_dqentry(struct super_block *sb, int type, dqbuf_t buf, uint blk)
-{
-        dqbuf_t tmpbuf = getdqbuf();
-        struct mem_dqinfo *info = sb_dqinfo(sb, type);
-        struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf;
-        int err;
-        if (!tmpbuf)
-                return -ENOMEM;
-        dh->dqdh_next_free = cpu_to_le32(info->u.v2_i.dqi_free_entry);
-        dh->dqdh_prev_free = cpu_to_le32(0);
-        if ((err = write_blk(sb, type, blk, buf)) < 0)
-                goto out_buf;
-        if (info->u.v2_i.dqi_free_entry) {
-                if ((err = read_blk(sb, type, info->u.v2_i.dqi_free_entry, tmpbuf)) < 0)
-                        goto out_buf;
-                ((struct v2_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = cpu_to_le32(blk);
-                if ((err = write_blk(sb, type, info->u.v2_i.dqi_free_entry, tmpbuf)) < 0)
-                        goto out_buf;
-        }
-        freedqbuf(tmpbuf);
-        info->u.v2_i.dqi_free_entry = blk;
-        mark_info_dirty(sb, type);
-        return 0;
-out_buf:
-        freedqbuf(tmpbuf);
-        return err;
-}
-/* Find space for dquot */
-static uint find_free_dqentry(struct dquot *dquot, int *err)
-{
-        struct super_block *sb = dquot->dq_sb;
-        struct mem_dqinfo *info = sb_dqopt(sb)->info+dquot->dq_type;
-        uint blk, i;
-        struct v2_disk_dqdbheader *dh;
-        struct v2_disk_dqblk *ddquot;
-        struct v2_disk_dqblk fakedquot;
-        dqbuf_t buf;
-        *err = 0;
-        if (!(buf = getdqbuf())) {
-                *err = -ENOMEM;
                return 0;
-        }
+        return le32_to_cpu(d->dqb_id) == dquot->dq_id;
-        dh = (struct v2_disk_dqdbheader *)buf;
-        ddquot = GETENTRIES(buf);
-        if (info->u.v2_i.dqi_free_entry) {
-                blk = info->u.v2_i.dqi_free_entry;
-                if ((*err = read_blk(sb, dquot->dq_type, blk, buf)) < 0)
-                        goto out_buf;
-        }
-        else {
-                blk = get_free_dqblk(sb, dquot->dq_type);
-                if ((int)blk < 0) {
-                        *err = blk;
-                        freedqbuf(buf);
-                        return 0;
-                }
-                memset(buf, 0, V2_DQBLKSIZE);
-                /* This is enough as block is already zeroed and entry list is empty... */
-                info->u.v2_i.dqi_free_entry = blk;
-                mark_info_dirty(sb, dquot->dq_type);
-        }
-        if (le16_to_cpu(dh->dqdh_entries)+1 >= V2_DQSTRINBLK)   /* Block will be full? */
-                if ((*err = remove_free_dqentry(sb, dquot->dq_type, buf, blk)) < 0) {
-                        printk(KERN_ERR "VFS: find_free_dqentry(): Can't remove block (%u) from entry free list.\n", blk);
-                        goto out_buf;
-                }
-        le16_add_cpu(&dh->dqdh_entries, 1);
-        memset(&fakedquot, 0, sizeof(struct v2_disk_dqblk));
-        /* Find free structure in block */
-        for (i = 0; i < V2_DQSTRINBLK && memcmp(&fakedquot, ddquot+i, sizeof(struct v2_disk_dqblk)); i++);
-#ifdef __QUOTA_V2_PARANOIA
-        if (i == V2_DQSTRINBLK) {
-                printk(KERN_ERR "VFS: find_free_dqentry(): Data block full but it shouldn't.\n");
-                *err = -EIO;
-                goto out_buf;
-        }
-#endif
-        if ((*err = write_blk(sb, dquot->dq_type, blk, buf)) < 0) {
-                printk(KERN_ERR "VFS: find_free_dqentry(): Can't write quota data block %u.\n", blk);
-                goto out_buf;
-        }
-        dquot->dq_off = (blk<<V2_DQBLKSIZE_BITS)+sizeof(struct v2_disk_dqdbheader)+i*sizeof(struct v2_disk_dqblk);
-        freedqbuf(buf);
-        return blk;
-out_buf:
-        freedqbuf(buf);
-        return 0;
-}
-/* Insert reference to structure into the trie */
-static int do_insert_tree(struct dquot *dquot, uint *treeblk, int depth)
-{
-        struct super_block *sb = dquot->dq_sb;
-        dqbuf_t buf;
-        int ret = 0, newson = 0, newact = 0;
-        __le32 *ref;
-        uint newblk;
-        if (!(buf = getdqbuf()))
-                return -ENOMEM;
-        if (!*treeblk) {
-                ret = get_free_dqblk(sb, dquot->dq_type);
-                if (ret < 0)
-                        goto out_buf;
-                *treeblk = ret;
-                memset(buf, 0, V2_DQBLKSIZE);
-                newact = 1;
-        }
-        else {
-                if ((ret = read_blk(sb, dquot->dq_type, *treeblk, buf)) < 0) {
-                        printk(KERN_ERR "VFS: Can't read tree quota block %u.\n", *treeblk);
-                        goto out_buf;
-                }
-        }
-        ref = (__le32 *)buf;
-        newblk = le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]);
-        if (!newblk)
-                newson = 1;
-        if (depth == V2_DQTREEDEPTH-1) {
-#ifdef __QUOTA_V2_PARANOIA
-                if (newblk) {
-                        printk(KERN_ERR "VFS: Inserting already present quota entry (block %u).\n", le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]));
-                        ret = -EIO;
-                        goto out_buf;
-                }
-#endif
-                newblk = find_free_dqentry(dquot, &ret);
-        }
-        else
-                ret = do_insert_tree(dquot, &newblk, depth+1);
-        if (newson && ret >= 0) {
-                ref[GETIDINDEX(dquot->dq_id, depth)] = cpu_to_le32(newblk);
-                ret = write_blk(sb, dquot->dq_type, *treeblk, buf);
-        }
-        else if (newact && ret < 0)
-                put_free_dqblk(sb, dquot->dq_type, buf, *treeblk);
-out_buf:
-        freedqbuf(buf);
-        return ret;
 }
-/* Wrapper for inserting quota structure into tree */
+static int v2_read_dquot(struct dquot *dquot)
-static inline int dq_insert_tree(struct dquot *dquot)
 {
-        int tmp = V2_DQTREEOFF;
+        return qtree_read_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv, dquot);
-        return do_insert_tree(dquot, &tmp, 0);
 }
-/*
- *      We don't have to be afraid of deadlocks as we never have quotas on quota files...
- */
 static int v2_write_dquot(struct dquot *dquot)
 {
-        int type = dquot->dq_type;
+        return qtree_write_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv, dquot);
-        ssize_t ret;
-        struct v2_disk_dqblk ddquot, empty;
-        /* dq_off is guarded by dqio_mutex */
-        if (!dquot->dq_off)
-                if ((ret = dq_insert_tree(dquot)) < 0) {
-                        printk(KERN_ERR "VFS: Error %zd occurred while creating quota.\n", ret);
-                        return ret;
-                }
-        spin_lock(&dq_data_lock);
-        mem2diskdqb(&ddquot, &dquot->dq_dqb, dquot->dq_id);
-        /* Argh... We may need to write structure full of zeroes but that would be
-         * treated as an empty place by the rest of the code. Format change would
-         * be definitely cleaner but the problems probably are not worth it */
-        memset(&empty, 0, sizeof(struct v2_disk_dqblk));
-        if (!memcmp(&empty, &ddquot, sizeof(struct v2_disk_dqblk)))
-                ddquot.dqb_itime = cpu_to_le64(1);
-        spin_unlock(&dq_data_lock);
-        ret = dquot->dq_sb->s_op->quota_write(dquot->dq_sb, type,
-              (char *)&ddquot, sizeof(struct v2_disk_dqblk), dquot->dq_off);
-        if (ret != sizeof(struct v2_disk_dqblk)) {
-                printk(KERN_WARNING "VFS: dquota write failed on dev %s\n", dquot->dq_sb->s_id);
-                if (ret >= 0)
-                        ret = -ENOSPC;
-        }
-        else
-                ret = 0;
-        dqstats.writes++;
-        return ret;
 }
-/* Free dquot entry in data block */
+static int v2_release_dquot(struct dquot *dquot)
-static int free_dqentry(struct dquot *dquot, uint blk)
-{
-        struct super_block *sb = dquot->dq_sb;
-        int type = dquot->dq_type;
-        struct v2_disk_dqdbheader *dh;
-        dqbuf_t buf = getdqbuf();
-        int ret = 0;
-        if (!buf)
-                return -ENOMEM;
-        if (dquot->dq_off >> V2_DQBLKSIZE_BITS != blk) {
-                printk(KERN_ERR "VFS: Quota structure has offset to other "
-                  "block (%u) than it should (%u).\n", blk,
-                  (uint)(dquot->dq_off >> V2_DQBLKSIZE_BITS));
-                goto out_buf;
-        }
-        if ((ret = read_blk(sb, type, blk, buf)) < 0) {
-                printk(KERN_ERR "VFS: Can't read quota data block %u\n", blk);
-                goto out_buf;
-        }
-        dh = (struct v2_disk_dqdbheader *)buf;
-        le16_add_cpu(&dh->dqdh_entries, -1);
-        if (!le16_to_cpu(dh->dqdh_entries)) {   /* Block got free? */
-                if ((ret = remove_free_dqentry(sb, type, buf, blk)) < 0 ||
-                    (ret = put_free_dqblk(sb, type, buf, blk)) < 0) {
-                        printk(KERN_ERR "VFS: Can't move quota data block (%u) "
-                          "to free list.\n", blk);
-                        goto out_buf;
-                }
-        }
-        else {
-                memset(buf+(dquot->dq_off & ((1 << V2_DQBLKSIZE_BITS)-1)), 0,
-                  sizeof(struct v2_disk_dqblk));
-                if (le16_to_cpu(dh->dqdh_entries) == V2_DQSTRINBLK-1) {
-                        /* Insert will write block itself */
-                        if ((ret = insert_free_dqentry(sb, type, buf, blk)) < 0) {
-                                printk(KERN_ERR "VFS: Can't insert quota data block (%u) to free entry list.\n", blk);
-                                goto out_buf;
-                        }
-                }
-                else
-                        if ((ret = write_blk(sb, type, blk, buf)) < 0) {
-                                printk(KERN_ERR "VFS: Can't write quota data "
-                                  "block %u\n", blk);
-                                goto out_buf;
-                        }
-        }
-        dquot->dq_off = 0;      /* Quota is now unattached */
-out_buf:
-        freedqbuf(buf);
-        return ret;
-}
-/* Remove reference to dquot from tree */
-static int remove_tree(struct dquot *dquot, uint *blk, int depth)
-{
-        struct super_block *sb = dquot->dq_sb;
-        int type = dquot->dq_type;
-        dqbuf_t buf = getdqbuf();
-        int ret = 0;
-        uint newblk;
-        __le32 *ref = (__le32 *)buf;
-        
-        if (!buf)
-                return -ENOMEM;
-        if ((ret = read_blk(sb, type, *blk, buf)) < 0) {
-                printk(KERN_ERR "VFS: Can't read quota data block %u\n", *blk);
-                goto out_buf;
-        }
-        newblk = le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]);
-        if (depth == V2_DQTREEDEPTH-1) {
-                ret = free_dqentry(dquot, newblk);
-                newblk = 0;
-        }
-        else
-                ret = remove_tree(dquot, &newblk, depth+1);
-        if (ret >= 0 && !newblk) {
-                int i;
-                ref[GETIDINDEX(dquot->dq_id, depth)] = cpu_to_le32(0);
-                for (i = 0; i < V2_DQBLKSIZE && !buf[i]; i++);  /* Block got empty? */
-                /* Don't put the root block into the free block list */
-                if (i == V2_DQBLKSIZE && *blk != V2_DQTREEOFF) {
-                        put_free_dqblk(sb, type, buf, *blk);
-                        *blk = 0;
-                }
-                else
-                        if ((ret = write_blk(sb, type, *blk, buf)) < 0)
-                                printk(KERN_ERR "VFS: Can't write quota tree "
-                                  "block %u.\n", *blk);
-        }
-out_buf:
-        freedqbuf(buf);
-        return ret;     
-}
-/* Delete dquot from tree */
-static int v2_delete_dquot(struct dquot *dquot)
-{
-        uint tmp = V2_DQTREEOFF;
-        if (!dquot->dq_off)     /* Even not allocated? */
-                return 0;
-        return remove_tree(dquot, &tmp, 0);
-}
-/* Find entry in block */
-static loff_t find_block_dqentry(struct dquot *dquot, uint blk)
-{
-        dqbuf_t buf = getdqbuf();
-        loff_t ret = 0;
-        int i;
-        struct v2_disk_dqblk *ddquot = GETENTRIES(buf);
-        if (!buf)
-                return -ENOMEM;
-        if ((ret = read_blk(dquot->dq_sb, dquot->dq_type, blk, buf)) < 0) {
-                printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
-                goto out_buf;
-        }
-        if (dquot->dq_id)
-                for (i = 0; i < V2_DQSTRINBLK &&
-                     le32_to_cpu(ddquot[i].dqb_id) != dquot->dq_id; i++);
-        else {  /* ID 0 as a bit more complicated searching... */
-                struct v2_disk_dqblk fakedquot;
-                memset(&fakedquot, 0, sizeof(struct v2_disk_dqblk));
-                for (i = 0; i < V2_DQSTRINBLK; i++)
-                        if (!le32_to_cpu(ddquot[i].dqb_id) &&
-                            memcmp(&fakedquot, ddquot+i, sizeof(struct v2_disk_dqblk)))
-                                break;
-        }
-        if (i == V2_DQSTRINBLK) {
-                printk(KERN_ERR "VFS: Quota for id %u referenced "
-                  "but not present.\n", dquot->dq_id);
-                ret = -EIO;
-                goto out_buf;
-        }
-        else
-                ret = (blk << V2_DQBLKSIZE_BITS) + sizeof(struct
-                  v2_disk_dqdbheader) + i * sizeof(struct v2_disk_dqblk);
-out_buf:
-        freedqbuf(buf);
-        return ret;
-}
-/* Find entry for given id in the tree */
-static loff_t find_tree_dqentry(struct dquot *dquot, uint blk, int depth)
-{
-        dqbuf_t buf = getdqbuf();
-        loff_t ret = 0;
-        __le32 *ref = (__le32 *)buf;
-        if (!buf)
-                return -ENOMEM;
-        if ((ret = read_blk(dquot->dq_sb, dquot->dq_type, blk, buf)) < 0) {
-                printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
-                goto out_buf;
-        }
-        ret = 0;
-        blk = le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]);
-        if (!blk)       /* No reference? */
-                goto out_buf;
-        if (depth < V2_DQTREEDEPTH-1)
-                ret = find_tree_dqentry(dquot, blk, depth+1);
-        else
-                ret = find_block_dqentry(dquot, blk);
-out_buf:
-        freedqbuf(buf);
-        return ret;
-}
-/* Find entry for given id in the tree - wrapper function */
-static inline loff_t find_dqentry(struct dquot *dquot)
-{
-        return find_tree_dqentry(dquot, V2_DQTREEOFF, 0);
-}
-static int v2_read_dquot(struct dquot *dquot)
 {
-        int type = dquot->dq_type;
+        return qtree_release_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv, dquot);
-        loff_t offset;
-        struct v2_disk_dqblk ddquot, empty;
-        int ret = 0;
-#ifdef __QUOTA_V2_PARANOIA
-        /* Invalidated quota? */
-        if (!dquot->dq_sb || !sb_dqopt(dquot->dq_sb)->files[type]) {
-                printk(KERN_ERR "VFS: Quota invalidated while reading!\n");
-                return -EIO;
-        }
-#endif
-        offset = find_dqentry(dquot);
-        if (offset <= 0) {      /* Entry not present? */
-                if (offset < 0)
-                        printk(KERN_ERR "VFS: Can't read quota "
-                          "structure for id %u.\n", dquot->dq_id);
-                dquot->dq_off = 0;
-                set_bit(DQ_FAKE_B, &dquot->dq_flags);
-                memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
-                ret = offset;
-        }
-        else {
-                dquot->dq_off = offset;
-                if ((ret = dquot->dq_sb->s_op->quota_read(dquot->dq_sb, type,
-                    (char *)&ddquot, sizeof(struct v2_disk_dqblk), offset))
-                    != sizeof(struct v2_disk_dqblk)) {
-                        if (ret >= 0)
-                                ret = -EIO;
-                        printk(KERN_ERR "VFS: Error while reading quota "
-                          "structure for id %u.\n", dquot->dq_id);
-                        memset(&ddquot, 0, sizeof(struct v2_disk_dqblk));
-                }
-                else {
-                        ret = 0;
-                        /* We need to escape back all-zero structure */
-                        memset(&empty, 0, sizeof(struct v2_disk_dqblk));
-                        empty.dqb_itime = cpu_to_le64(1);
-                        if (!memcmp(&empty, &ddquot, sizeof(struct v2_disk_dqblk)))
-                                ddquot.dqb_itime = 0;
-                }
-                disk2memdqb(&dquot->dq_dqb, &ddquot);
-                if (!dquot->dq_dqb.dqb_bhardlimit &&
-                        !dquot->dq_dqb.dqb_bsoftlimit &&
-                        !dquot->dq_dqb.dqb_ihardlimit &&
-                        !dquot->dq_dqb.dqb_isoftlimit)
-                        set_bit(DQ_FAKE_B, &dquot->dq_flags);
-        }
-        dqstats.reads++;
-        return ret;
 }
-/* Check whether dquot should not be deleted. We know we are
+static int v2_free_file_info(struct super_block *sb, int type)
- * the only one operating on dquot (thanks to dq_lock) */
-static int v2_release_dquot(struct dquot *dquot)
 {
-        if (test_bit(DQ_FAKE_B, &dquot->dq_flags) && !(dquot->dq_dqb.dqb_curinodes | dquot->dq_dqb.dqb_curspace))
+        kfree(sb_dqinfo(sb, type)->dqi_priv);
-                return v2_delete_dquot(dquot);
        return 0;
 }
@@ -673,7 +210,7 @@ static struct quota_format_ops v2_format_ops = {
        .check_quota_file       = v2_check_quota_file,
        .read_file_info         = v2_read_file_info,
        .write_file_info        = v2_write_file_info,
-        .free_file_info         = NULL,
+        .free_file_info         = v2_free_file_info,
        .read_dqblk             = v2_read_dquot,
        .commit_dqblk           = v2_write_dquot,
        .release_dqblk          = v2_release_dquot,
diff --git a/fs/quotaio_v1.h b/fs/quotaio_v1.h
new file mode 100644
index 000000000000..746654b5de70
--- /dev/null
+++ b/fs/quotaio_v1.h
@@ -0,0 +1,33 @@
+#ifndef _LINUX_QUOTAIO_V1_H
+#define _LINUX_QUOTAIO_V1_H
+#include <linux/types.h>
+/*
+ * The following constants define the amount of time given a user
+ * before the soft limits are treated as hard limits (usually resulting
+ * in an allocation failure). The timer is started when the user crosses
+ * their soft limit; it is reset when they go below their soft limit.
+ */
+#define MAX_IQ_TIME  604800     /* (7*24*60*60) 1 week */
+#define MAX_DQ_TIME  604800     /* (7*24*60*60) 1 week */
+/*
+ * The following structure defines the format of the disk quota file
+ * (as it appears on disk) - the file is an array of these structures
+ * indexed by user or group number.
+ */
+struct v1_disk_dqblk {
+        __u32 dqb_bhardlimit;   /* absolute limit on disk blks alloc */
+        __u32 dqb_bsoftlimit;   /* preferred limit on disk blks */
+        __u32 dqb_curblocks;    /* current block count */
+        __u32 dqb_ihardlimit;   /* absolute limit on allocated inodes */
+        __u32 dqb_isoftlimit;   /* preferred inode limit */
+        __u32 dqb_curinodes;    /* current # allocated inodes */
+        time_t dqb_btime;       /* time limit for excessive disk use */
+        time_t dqb_itime;       /* time limit for excessive inode use */
+};
+#define v1_dqoff(UID)      ((loff_t)((UID) * sizeof (struct v1_disk_dqblk))) /* Byte offset of UID's record in the v1 quota file. NOTE(review): the multiply is evaluated inside the inner parentheses, before the loff_t cast - confirm it cannot wrap for large ids where size_t is 32 bits. */
+#endif  /* _LINUX_QUOTAIO_V1_H */
diff --git a/fs/quotaio_v2.h b/fs/quotaio_v2.h
new file mode 100644
index 000000000000..530fe580685c
--- /dev/null
+++ b/fs/quotaio_v2.h
@@ -0,0 +1,60 @@
+/*
+ *      Definitions of structures for vfsv0 quota format
+ */
+#ifndef _LINUX_QUOTAIO_V2_H
+#define _LINUX_QUOTAIO_V2_H
+#include <linux/types.h>
+#include <linux/quota.h>
+/*
+ * Definitions of magics and versions of current quota files
+ */
+#define V2_INITQMAGICS {\
+        0xd9c01f11,     /* USRQUOTA */\
+        0xd9c01927      /* GRPQUOTA */\
+}
+#define V2_INITQVERSIONS {\
+        0,              /* USRQUOTA */\
+        0               /* GRPQUOTA */\
+}
+/* First generic header */
+struct v2_disk_dqheader {
+        __le32 dqh_magic;       /* Magic number identifying file */
+        __le32 dqh_version;     /* File version */
+};
+/*
+ * The following structure defines the format of the disk quota file
+ * (as it appears on disk) - the file is a radix tree whose leaves point
+ * to blocks of these structures.
+ */
+struct v2_disk_dqblk {
+        __le32 dqb_id;          /* id this quota applies to */
+        __le32 dqb_ihardlimit;  /* absolute limit on allocated inodes */
+        __le32 dqb_isoftlimit;  /* preferred inode limit */
+        __le32 dqb_curinodes;   /* current # allocated inodes */
+        __le32 dqb_bhardlimit;  /* absolute limit on disk space (in QUOTABLOCK_SIZE) */
+        __le32 dqb_bsoftlimit;  /* preferred limit on disk space (in QUOTABLOCK_SIZE) */
+        __le64 dqb_curspace;    /* current space occupied (in bytes) */
+        __le64 dqb_btime;       /* time limit for excessive disk use */
+        __le64 dqb_itime;       /* time limit for excessive inode use */
+};
+/* Header with type and version specific information */
+struct v2_disk_dqinfo {
+        __le32 dqi_bgrace;      /* Time before block soft limit becomes hard limit */
+        __le32 dqi_igrace;      /* Time before inode soft limit becomes hard limit */
+        __le32 dqi_flags;       /* Flags for quotafile (DQF_*) */
+        __le32 dqi_blocks;      /* Number of blocks in file */
+        __le32 dqi_free_blk;    /* Number of first free block in the list */
+        __le32 dqi_free_entry;  /* Number of block with at least one free entry */
+};
+#define V2_DQINFOOFF    sizeof(struct v2_disk_dqheader) /* Offset of info header in file */
+#define V2_DQBLKSIZE_BITS 10                            /* log2 of the block size in the quota tree, in bytes (1 << 10 = 1024) */
+#endif /* _LINUX_QUOTAIO_V2_H */
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 76acdbc34611..b9b567a28376 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -262,11 +262,11 @@ unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
        ret = -ENOMEM;
        pages = kzalloc(lpages * sizeof(struct page *), GFP_KERNEL);
        if (!pages)
-                goto out;
+                goto out_free;
        nr = find_get_pages(inode->i_mapping, pgoff, lpages, pages);
        if (nr != lpages)
-                goto out; /* leave if some pages were missing */
+                goto out_free_pages; /* leave if some pages were missing */
        /* check the pages for physical adjacency */
        ptr = pages;
@@ -274,19 +274,18 @@ unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
        page++;
        for (loop = lpages; loop > 1; loop--)
                if (*ptr++ != page++)
-                        goto out;
+                        goto out_free_pages;
        /* okay - all conditions fulfilled */
        ret = (unsigned long) page_address(pages[0]);
- out:
+out_free_pages:
-        if (pages) {
+        ptr = pages;
-                ptr = pages;
+        for (loop = nr; loop > 0; loop--)
-                for (loop = lpages; loop > 0; loop--)
+                put_page(*ptr++);
-                        put_page(*ptr++);
+out_free:
-                kfree(pages);
+        kfree(pages);
-        }
+out:
        return ret;
 }
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index a83a3518ae33..b7e6ac706b87 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -57,7 +57,6 @@ struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev)
                inode->i_mode = mode;
                inode->i_uid = current_fsuid();
                inode->i_gid = current_fsgid();
-                inode->i_blocks = 0;
                inode->i_mapping->a_ops = &ramfs_aops;
                inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
                mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
diff --git a/fs/read_write.c b/fs/read_write.c
index 969a6d9c020b..400fe81c973e 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -50,6 +50,14 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
                offset += inode->i_size;
                break;
        case SEEK_CUR:
+                /*
+                 * Here we special-case the lseek(fd, 0, SEEK_CUR)
+                 * position-querying operation.  Avoid rewriting the "same"
+                 * f_pos value back to the file because a concurrent read(),
+                 * write() or lseek() might have altered it
+                 */
+                if (offset == 0)
+                        return file->f_pos;
                offset += file->f_pos;
                break;
        }
@@ -105,6 +113,10 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
                        offset += i_size_read(file->f_path.dentry->d_inode);
                        break;
                case SEEK_CUR:
+                        if (offset == 0) {
+                                retval = file->f_pos;
+                                goto out;
+                        }
                        offset += file->f_pos;
        }
        retval = -EINVAL;
@@ -115,6 +127,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
                }
                retval = offset;
        }
+out:
        unlock_kernel();
        return retval;
 }
@@ -134,7 +147,7 @@ loff_t vfs_llseek(struct file *file, loff_t offset, int origin)
 }
 EXPORT_SYMBOL(vfs_llseek);
-asmlinkage off_t sys_lseek(unsigned int fd, off_t offset, unsigned int origin)
+SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, origin)
 {
        off_t retval;
        struct file * file;
@@ -158,9 +171,9 @@ bad:
 }
 #ifdef __ARCH_WANT_SYS_LLSEEK
-asmlinkage long sys_llseek(unsigned int fd, unsigned long offset_high,
+SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
-                           unsigned long offset_low, loff_t __user * result,
+                unsigned long, offset_low, loff_t __user *, result,
-                           unsigned int origin)
+                unsigned int, origin)
 {
        int retval;
        struct file * file;
@@ -356,7 +369,7 @@ static inline void file_pos_write(struct file *file, loff_t pos)
        file->f_pos = pos;
 }
-asmlinkage ssize_t sys_read(unsigned int fd, char __user * buf, size_t count)
+SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
 {
        struct file *file;
        ssize_t ret = -EBADF;
@@ -373,7 +386,8 @@ asmlinkage ssize_t sys_read(unsigned int fd, char __user * buf, size_t count)
        return ret;
 }
-asmlinkage ssize_t sys_write(unsigned int fd, const char __user * buf, size_t count)
+SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
+                size_t, count)
 {
        struct file *file;
        ssize_t ret = -EBADF;
@@ -390,8 +404,8 @@ asmlinkage ssize_t sys_write(unsigned int fd, const char __user * buf, size_t co
        return ret;
 }
-asmlinkage ssize_t sys_pread64(unsigned int fd, char __user *buf,
+SYSCALL_DEFINE(pread64)(unsigned int fd, char __user *buf,
-                             size_t count, loff_t pos)
+                        size_t count, loff_t pos)
 {
        struct file *file;
        ssize_t ret = -EBADF;
@@ -410,9 +424,17 @@ asmlinkage ssize_t sys_pread64(unsigned int fd, char __user *buf,
        return ret;
 }
+#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
+asmlinkage long SyS_pread64(long fd, long buf, long count, loff_t pos)
+{
+        return SYSC_pread64((unsigned int) fd, (char __user *) buf,
+                            (size_t) count, pos);
+}
+SYSCALL_ALIAS(sys_pread64, SyS_pread64);
+#endif
-asmlinkage ssize_t sys_pwrite64(unsigned int fd, const char __user *buf,
+SYSCALL_DEFINE(pwrite64)(unsigned int fd, const char __user *buf,
-                              size_t count, loff_t pos)
+                         size_t count, loff_t pos)
 {
        struct file *file;
        ssize_t ret = -EBADF;
@@ -431,6 +453,14 @@ asmlinkage ssize_t sys_pwrite64(unsigned int fd, const char __user *buf,
        return ret;
 }
+#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
+asmlinkage long SyS_pwrite64(long fd, long buf, long count, loff_t pos)
+{
+        return SYSC_pwrite64((unsigned int) fd, (const char __user *) buf,
+                             (size_t) count, pos);
+}
+SYSCALL_ALIAS(sys_pwrite64, SyS_pwrite64);
+#endif
 /*
 * Reduce an iovec's length in-place.  Return the resulting number of segments
@@ -659,8 +689,8 @@ ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
 EXPORT_SYMBOL(vfs_writev);
-asmlinkage ssize_t
+SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
-sys_readv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen)
+                unsigned long, vlen)
 {
        struct file *file;
        ssize_t ret = -EBADF;
@@ -680,8 +710,8 @@ sys_readv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen)
        return ret;
 }
-asmlinkage ssize_t
+SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
-sys_writev(unsigned long fd, const struct iovec __user *vec, unsigned long vlen)
+                unsigned long, vlen)
 {
        struct file *file;
        ssize_t ret = -EBADF;
@@ -799,7 +829,7 @@ out:
        return retval;
 }
-asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t __user *offset, size_t count)
+SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
 {
        loff_t pos;
        off_t off;
@@ -818,7 +848,7 @@ asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t __user *offset, siz
        return do_sendfile(out_fd, in_fd, NULL, count, 0);
 }
-asmlinkage ssize_t sys_sendfile64(int out_fd, int in_fd, loff_t __user *offset, size_t count)
+SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
 {
        loff_t pos;
        ssize_t ret;
diff --git a/fs/readdir.c b/fs/readdir.c
index b318d9b5af2e..7723401f8d8b 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -102,7 +102,8 @@ efault:
        return -EFAULT;
 }
-asmlinkage long old_readdir(unsigned int fd, struct old_linux_dirent __user * dirent, unsigned int count)
+SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
+                struct old_linux_dirent __user *, dirent, unsigned int, count)
 {
        int error;
        struct file * file;
@@ -187,7 +188,8 @@ efault:
        return -EFAULT;
 }
-asmlinkage long sys_getdents(unsigned int fd, struct linux_dirent __user * dirent, unsigned int count)
+SYSCALL_DEFINE3(getdents, unsigned int, fd,
+                struct linux_dirent __user *, dirent, unsigned int, count)
 {
        struct file * file;
        struct linux_dirent __user * lastdirent;
@@ -268,7 +270,8 @@ efault:
        return -EFAULT;
 }
-asmlinkage long sys_getdents64(unsigned int fd, struct linux_dirent64 __user * dirent, unsigned int count)
+SYSCALL_DEFINE3(getdents64, unsigned int, fd,
+                struct linux_dirent64 __user *, dirent, unsigned int, count)
 {
        struct file * file;
        struct linux_dirent64 __user * lastdirent;
diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig
new file mode 100644
index 000000000000..949b8c6addc8
--- /dev/null
+++ b/fs/reiserfs/Kconfig
@@ -0,0 +1,85 @@
+config REISERFS_FS
+        tristate "Reiserfs support"
+        help
+          Stores not just filenames but the files themselves in a balanced
+          tree.  Uses journalling.
+          Balanced trees are more efficient than traditional file system
+          architectural foundations.
+          In general, ReiserFS is as fast as ext2, but is very efficient with
+          large directories and small files.  Additional patches are needed
+          for NFS and quotas; please see <http://www.namesys.com/> for links.
+          It is more easily extended to have features currently found in
+          database and keyword search systems than block allocation based file
+          systems are.  The next version will be so extended, and will support
+          plugins consistent with our motto ``It takes more than a license to
+          make source code open.''
+          Read <http://www.namesys.com/> to learn more about reiserfs.
+          Sponsored by Threshold Networks, Emusic.com, and Bigstorage.com.
+          If you like it, you can pay us to add new features to it that you
+          need, buy a support contract, or pay us to port it to another OS.
+config REISERFS_CHECK
+        bool "Enable reiserfs debug mode"
+        depends on REISERFS_FS
+        help
+          If you set this to Y, then ReiserFS will perform every check it can
+          possibly imagine of its internal consistency throughout its
+          operation.  It will also go substantially slower.  More than once we
+          have forgotten that this was on, and then gone despondent over the
+          latest benchmarks.:-) Use of this option allows our team to go all
+          out in checking for consistency when debugging without fear of its
+          effect on end users.  If you are on the verge of sending in a bug
+          report, say Y and you might get a useful error message.  Almost
+          everyone should say N.
+config REISERFS_PROC_INFO
+        bool "Stats in /proc/fs/reiserfs"
+        depends on REISERFS_FS && PROC_FS
+        help
+          Create under /proc/fs/reiserfs a hierarchy of files, displaying
+          various ReiserFS statistics and internal data at the expense of
+          making your kernel or module slightly larger (+8 KB). This also
+          increases the amount of kernel memory required for each mount.
+          Almost everyone but ReiserFS developers and people fine-tuning
+          reiserfs or tracing problems should say N.
+config REISERFS_FS_XATTR
+        bool "ReiserFS extended attributes"
+        depends on REISERFS_FS
+        help
+          Extended attributes are name:value pairs associated with inodes by
+          the kernel or by users (see the attr(5) manual page, or visit
+          <http://acl.bestbits.at/> for details).
+          If unsure, say N.
+config REISERFS_FS_POSIX_ACL
+        bool "ReiserFS POSIX Access Control Lists"
+        depends on REISERFS_FS_XATTR
+        select FS_POSIX_ACL
+        help
+          Posix Access Control Lists (ACLs) support permissions for users and
+          groups beyond the owner/group/world scheme.
+          To learn more about Access Control Lists, visit the Posix ACLs for
+          Linux website <http://acl.bestbits.at/>.
+          If you don't know what Access Control Lists are, say N.
+config REISERFS_FS_SECURITY
+        bool "ReiserFS Security Labels"
+        depends on REISERFS_FS_XATTR
+        help
+          Security labels support alternative access control models
+          implemented by security modules like SELinux.  This option
+          enables an extended attribute handler for file security
+          labels in the ReiserFS filesystem.
+          If you are not using a security module that requires using
+          extended attributes for file security labels, say N.
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 6c4c2c69449f..55fce92cdf18 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1753,6 +1753,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
                       struct inode *inode)
 {
        struct super_block *sb;
+        struct reiserfs_iget_args args;
        INITIALIZE_PATH(path_to_key);
        struct cpu_key key;
        struct item_head ih;
@@ -1780,6 +1781,20 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
                err = -ENOMEM;
                goto out_bad_inode;
        }
+        args.objectid = inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid);
+        if (old_format_only(sb))
+                make_le_item_head(&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET,
+                                  TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT);
+        else
+                make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET,
+                                  TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);
+        memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE);
+        args.dirid = le32_to_cpu(ih.ih_key.k_dir_id);
+        if (insert_inode_locked4(inode, args.objectid,
+                             reiserfs_find_actor, &args) < 0) {
+                err = -EINVAL;
+                goto out_bad_inode;
+        }
        if (old_format_only(sb))
                /* not a perfect generation count, as object ids can be reused, but 
                 ** this is as good as reiserfs can do right now.
@@ -1825,13 +1840,6 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
        reiserfs_init_acl_default(inode);
        reiserfs_init_xattr_rwsem(inode);
-        if (old_format_only(sb))
-                make_le_item_head(&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET,
-                                  TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT);
-        else
-                make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET,
-                                  TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);
        /* key to search for correct place for new stat data */
        _make_cpu_key(&key, KEY_FORMAT_3_6, le32_to_cpu(ih.ih_key.k_dir_id),
                      le32_to_cpu(ih.ih_key.k_objectid), SD_OFFSET,
@@ -1859,13 +1867,9 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
        } else {
                inode2sd(&sd, inode, inode->i_size);
        }
-        // these do not go to on-disk stat data
-        inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid);
        // store in in-core inode the key of stat data and version all
        // object items will have (directory items will have old offset
        // format, other new objects will consist of new items)
-        memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE);
        if (old_format_only(sb) || S_ISDIR(mode) || S_ISLNK(mode))
                set_inode_item_key_version(inode, KEY_FORMAT_3_5);
        else
@@ -1929,7 +1933,6 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
                reiserfs_mark_inode_private(inode);
        }
-        insert_inode_hash(inode);
        reiserfs_update_sd(th, inode);
        reiserfs_check_path(&path_to_key);
@@ -1956,6 +1959,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
      out_inserted_sd:
        inode->i_nlink = 0;
        th->t_trans_id = 0;     /* so the caller can't use this handle later */
+        unlock_new_inode(inode); /* OK to do even if we hadn't locked it */
        /* If we were inheriting an ACL, we need to release the lock so that
         * iput doesn't deadlock in reiserfs_delete_xattrs. The locking
@@ -2556,7 +2560,7 @@ static int reiserfs_write_begin(struct file *file,
        }
        index = pos >> PAGE_CACHE_SHIFT;
-        page = __grab_cache_page(mapping, index);
+        page = grab_cache_page_write_begin(mapping, index, flags);
        if (!page)
                return -ENOMEM;
        *pagep = page;
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 4f322e5ed840..738967f6c8ee 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -646,6 +646,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode,
                err = journal_end(&th, dir->i_sb, jbegin_count);
                if (err)
                        retval = err;
+                unlock_new_inode(inode);
                iput(inode);
                goto out_failed;
        }
@@ -653,6 +654,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode,
        reiserfs_update_inode_transaction(dir);
        d_instantiate(dentry, inode);
+        unlock_new_inode(inode);
        retval = journal_end(&th, dir->i_sb, jbegin_count);
      out_failed:
@@ -727,11 +729,13 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
                err = journal_end(&th, dir->i_sb, jbegin_count);
                if (err)
                        retval = err;
+                unlock_new_inode(inode);
                iput(inode);
                goto out_failed;
        }
        d_instantiate(dentry, inode);
+        unlock_new_inode(inode);
        retval = journal_end(&th, dir->i_sb, jbegin_count);
      out_failed:
@@ -812,6 +816,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
                err = journal_end(&th, dir->i_sb, jbegin_count);
                if (err)
                        retval = err;
+                unlock_new_inode(inode);
                iput(inode);
                goto out_failed;
        }
@@ -819,6 +824,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        reiserfs_update_sd(&th, dir);
        d_instantiate(dentry, inode);
+        unlock_new_inode(inode);
        retval = journal_end(&th, dir->i_sb, jbegin_count);
      out_failed:
        if (locked)
@@ -1096,11 +1102,13 @@ static int reiserfs_symlink(struct inode *parent_dir,
                err = journal_end(&th, parent_dir->i_sb, jbegin_count);
                if (err)
                        retval = err;
+                unlock_new_inode(inode);
                iput(inode);
                goto out_failed;
        }
        d_instantiate(dentry, inode);
+        unlock_new_inode(inode);
        retval = journal_end(&th, parent_dir->i_sb, jbegin_count);
      out_failed:
        reiserfs_write_unlock(parent_dir->i_sb);
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 663a91f5dce8..f3c820b75829 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -83,7 +83,7 @@ static void reiserfs_write_super(struct super_block *s)
        reiserfs_sync_fs(s, 1);
 }
-static void reiserfs_write_super_lockfs(struct super_block *s)
+static int reiserfs_freeze(struct super_block *s)
 {
        struct reiserfs_transaction_handle th;
        reiserfs_write_lock(s);
@@ -101,11 +101,13 @@ static void reiserfs_write_super_lockfs(struct super_block *s)
        }
        s->s_dirt = 0;
        reiserfs_write_unlock(s);
+        return 0;
 }
-static void reiserfs_unlockfs(struct super_block *s)
+static int reiserfs_unfreeze(struct super_block *s)
 {
        reiserfs_allow_writes(s);
+        return 0;
 }
 extern const struct in_core_key MAX_IN_CORE_KEY;
@@ -613,8 +615,8 @@ static const struct super_operations reiserfs_sops = {
        .put_super = reiserfs_put_super,
        .write_super = reiserfs_write_super,
        .sync_fs = reiserfs_sync_fs,
-        .write_super_lockfs = reiserfs_write_super_lockfs,
+        .freeze_fs = reiserfs_freeze,
-        .unlockfs = reiserfs_unlockfs,
+        .unfreeze_fs = reiserfs_unfreeze,
        .statfs = reiserfs_statfs,
        .remount_fs = reiserfs_remount,
        .show_options = generic_show_options,
@@ -649,6 +651,8 @@ static struct dquot_operations reiserfs_quota_operations = {
        .release_dquot = reiserfs_release_dquot,
        .mark_dirty = reiserfs_mark_dquot_dirty,
        .write_info = reiserfs_write_info,
+        .alloc_dquot    = dquot_alloc,
+        .destroy_dquot  = dquot_destroy,
 };
 static struct quotactl_ops reiserfs_qctl_operations = {
@@ -994,8 +998,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options,	/* strin
                if (c == 'u' || c == 'g') {
                        int qtype = c == 'u' ? USRQUOTA : GRPQUOTA;
-                        if ((sb_any_quota_enabled(s) ||
+                        if (sb_any_quota_loaded(s) &&
-                             sb_any_quota_suspended(s)) &&
                            (!*arg != !REISERFS_SB(s)->s_qf_names[qtype])) {
                                reiserfs_warning(s,
                                                 "reiserfs_parse_options: cannot change journaled quota options when quota turned on.");
@@ -1041,8 +1044,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options,	/* strin
                                                 "reiserfs_parse_options: unknown quota format specified.");
                                return 0;
                        }
-                        if ((sb_any_quota_enabled(s) ||
+                        if (sb_any_quota_loaded(s) &&
-                             sb_any_quota_suspended(s)) &&
                            *qfmt != REISERFS_SB(s)->s_jquota_fmt) {
                                reiserfs_warning(s,
                                                 "reiserfs_parse_options: cannot change journaled quota options when quota turned on.");
@@ -1067,7 +1069,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options,	/* strin
        }
        /* This checking is not precise wrt the quota type but for our purposes it is sufficient */
        if (!(*mount_options & (1 << REISERFS_QUOTA))
-            && sb_any_quota_enabled(s)) {
+            && sb_any_quota_loaded(s)) {
                reiserfs_warning(s,
                                 "reiserfs_parse_options: quota options must be present when quota is turned on.");
                return 0;
diff --git a/fs/romfs/Kconfig b/fs/romfs/Kconfig
new file mode 100644
index 000000000000..1a17020f9faf
--- /dev/null
+++ b/fs/romfs/Kconfig
@@ -0,0 +1,16 @@
+config ROMFS_FS
+        tristate "ROM file system support"
+        depends on BLOCK
+        ---help---
+          This is a very small read-only file system mainly intended for
+          initial ram disks of installation disks, but it could be used for
+          other read-only media as well.  Read
+          <file:Documentation/filesystems/romfs.txt> for details.
+          To compile this file system support as a module, choose M here: the
+          module will be called romfs.  Note that the file system of your
+          root partition (the one containing the directory /) cannot be a
+          module.
+          If you don't know whether you need it, then you don't need it:
+          answer N.
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c
index 60d2f822e87b..98a232f7196b 100644
--- a/fs/romfs/inode.c
+++ b/fs/romfs/inode.c
@@ -490,7 +490,7 @@ static mode_t romfs_modemap[] =
 static struct inode *
 romfs_iget(struct super_block *sb, unsigned long ino)
 {
-        int nextfh;
+        int nextfh, ret;
        struct romfs_inode ri;
        struct inode *i;
@@ -524,14 +524,13 @@ romfs_iget(struct super_block *sb, unsigned long ino)
        i->i_size = be32_to_cpu(ri.size);
        i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0;
        i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0;
-        i->i_uid = i->i_gid = 0;
        /* Precalculate the data offset */
-        ino = romfs_strnlen(i, ino+ROMFH_SIZE, ROMFS_MAXFN);
+        ret = romfs_strnlen(i, ino + ROMFH_SIZE, ROMFS_MAXFN);
-        if (ino >= 0)
+        if (ret >= 0)
-                ino = ((ROMFH_SIZE+ino+1+ROMFH_PAD)&ROMFH_MASK);
+                ino = (ROMFH_SIZE + ret + 1 + ROMFH_PAD) & ROMFH_MASK;
-        else
+        else
-                ino = 0;
+                ino = 0;
        ROMFS_I(i)->i_metasize = ino;
        ROMFS_I(i)->i_dataoffset = ino+(i->i_ino&ROMFH_MASK);
diff --git a/fs/select.c b/fs/select.c
index 87df51eadcf2..0fe0e1469df3 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -109,11 +109,11 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
 void poll_initwait(struct poll_wqueues *pwq)
 {
        init_poll_funcptr(&pwq->pt, __pollwait);
+        pwq->polling_task = current;
        pwq->error = 0;
        pwq->table = NULL;
        pwq->inline_index = 0;
 }
 EXPORT_SYMBOL(poll_initwait);
 static void free_poll_entry(struct poll_table_entry *entry)
@@ -142,12 +142,10 @@ void poll_freewait(struct poll_wqueues *pwq)
                free_page((unsigned long) old);
        }
 }
 EXPORT_SYMBOL(poll_freewait);
-static struct poll_table_entry *poll_get_entry(poll_table *_p)
+static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p)
 {
-        struct poll_wqueues *p = container_of(_p, struct poll_wqueues, pt);
        struct poll_table_page *table = p->table;
        if (p->inline_index < N_INLINE_POLL_ENTRIES)
@@ -159,7 +157,6 @@ static struct poll_table_entry *poll_get_entry(poll_table *_p)
                new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
                if (!new_table) {
                        p->error = -ENOMEM;
-                        __set_current_state(TASK_RUNNING);
                        return NULL;
                }
                new_table->entry = new_table->entries;
@@ -171,20 +168,75 @@ static struct poll_table_entry *poll_get_entry(poll_table *_p)
        return table->entry++;
 }
+static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+        struct poll_wqueues *pwq = wait->private;
+        DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);
+        /*
+         * Although this function is called under waitqueue lock, LOCK
+         * doesn't imply write barrier and the users expect write
+         * barrier semantics on wakeup functions.  The following
+         * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
+         * and is paired with set_mb() in poll_schedule_timeout.
+         */
+        smp_wmb();
+        pwq->triggered = 1;
+        /*
+         * Perform the default wake up operation using a dummy
+         * waitqueue.
+         *
+         * TODO: This is hacky but there currently is no interface to
+         * pass in @sync.  @sync is scheduled to be removed and once
+         * that happens, wake_up_process() can be used directly.
+         */
+        return default_wake_function(&dummy_wait, mode, sync, key);
+}
 /* Add a new entry */
 static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
                                poll_table *p)
 {
-        struct poll_table_entry *entry = poll_get_entry(p);
+        struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
+        struct poll_table_entry *entry = poll_get_entry(pwq);
        if (!entry)
                return;
        get_file(filp);
        entry->filp = filp;
        entry->wait_address = wait_address;
-        init_waitqueue_entry(&entry->wait, current);
+        init_waitqueue_func_entry(&entry->wait, pollwake);
+        entry->wait.private = pwq;
        add_wait_queue(wait_address, &entry->wait);
 }
+int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
+                          ktime_t *expires, unsigned long slack)
+{
+        int rc = -EINTR;
+        set_current_state(state);
+        if (!pwq->triggered)
+                rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);
+        __set_current_state(TASK_RUNNING);
+        /*
+         * Prepare for the next iteration.
+         *
+         * The following set_mb() serves two purposes.  First, it's
+         * the counterpart rmb of the wmb in pollwake() such that data
+         * written before wake up is always visible after wake up.
+         * Second, the full barrier guarantees that triggered clearing
+         * doesn't pass event check of the next iteration.  Note that
+         * this problem doesn't exist for the first iteration as
+         * add_wait_queue() has full barrier semantics.
+         */
+        set_mb(pwq->triggered, 0);
+        return rc;
+}
+EXPORT_SYMBOL(poll_schedule_timeout);
 /**
 * poll_select_set_timeout - helper function to setup the timeout value
 * @to:         pointer to timespec variable for the final timeout
@@ -340,8 +392,6 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
        for (;;) {
                unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
-                set_current_state(TASK_INTERRUPTIBLE);
                inp = fds->in; outp = fds->out; exp = fds->ex;
                rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;
@@ -411,10 +461,10 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
                        to = &expire;
                }
-                if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
+                if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
+                                           to, slack))
                        timed_out = 1;
        }
-        __set_current_state(TASK_RUNNING);
        poll_freewait(&table);
@@ -507,8 +557,8 @@ out_nofds:
        return ret;
 }
-asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp,
+SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp,
-                        fd_set __user *exp, struct timeval __user *tvp)
+                fd_set __user *, exp, struct timeval __user *, tvp)
 {
        struct timespec end_time, *to = NULL;
        struct timeval tv;
@@ -532,9 +582,9 @@ asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp,
 }
 #ifdef HAVE_SET_RESTORE_SIGMASK
-asmlinkage long sys_pselect7(int n, fd_set __user *inp, fd_set __user *outp,
+static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
-                fd_set __user *exp, struct timespec __user *tsp,
+                       fd_set __user *exp, struct timespec __user *tsp,
-                const sigset_t __user *sigmask, size_t sigsetsize)
+                       const sigset_t __user *sigmask, size_t sigsetsize)
 {
        sigset_t ksigmask, sigsaved;
        struct timespec ts, end_time, *to = NULL;
@@ -560,7 +610,7 @@ asmlinkage long sys_pselect7(int n, fd_set __user *inp, fd_set __user *outp,
                sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
        }
-        ret = core_sys_select(n, inp, outp, exp, &end_time);
+        ret = core_sys_select(n, inp, outp, exp, to);
        ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);
        if (ret == -ERESTARTNOHAND) {
@@ -586,8 +636,9 @@ asmlinkage long sys_pselect7(int n, fd_set __user *inp, fd_set __user *outp,
 * which has a pointer to the sigset_t itself followed by a size_t containing
 * the sigset size.
 */
-asmlinkage long sys_pselect6(int n, fd_set __user *inp, fd_set __user *outp,
+SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp,
-        fd_set __user *exp, struct timespec __user *tsp, void __user *sig)
+                fd_set __user *, exp, struct timespec __user *, tsp,
+                void __user *, sig)
 {
        size_t sigsetsize = 0;
        sigset_t __user *up = NULL;
@@ -600,7 +651,7 @@ asmlinkage long sys_pselect6(int n, fd_set __user *inp, fd_set __user *outp,
                        return -EFAULT;
        }
-        return sys_pselect7(n, inp, outp, exp, tsp, up, sigsetsize);
+        return do_pselect(n, inp, outp, exp, tsp, up, sigsetsize);
 }
 #endif /* HAVE_SET_RESTORE_SIGMASK */
@@ -666,7 +717,6 @@ static int do_poll(unsigned int nfds,  struct poll_list *list,
        for (;;) {
                struct poll_list *walk;
-                set_current_state(TASK_INTERRUPTIBLE);
                for (walk = list; walk != NULL; walk = walk->next) {
                        struct pollfd * pfd, * pfd_end;
@@ -709,10 +759,9 @@ static int do_poll(unsigned int nfds,  struct poll_list *list,
                        to = &expire;
                }
-                if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
+                if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
                        timed_out = 1;
        }
-        __set_current_state(TASK_RUNNING);
        return count;
 }
@@ -806,8 +855,8 @@ static long do_restart_poll(struct restart_block *restart_block)
        return ret;
 }
-asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds,
+SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
-                        long timeout_msecs)
+                long, timeout_msecs)
 {
        struct timespec end_time, *to = NULL;
        int ret;
@@ -841,9 +890,9 @@ asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds,
 }
 #ifdef HAVE_SET_RESTORE_SIGMASK
-asmlinkage long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds,
+SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
-        struct timespec __user *tsp, const sigset_t __user *sigmask,
+                struct timespec __user *, tsp, const sigset_t __user *, sigmask,
-        size_t sigsetsize)
+                size_t, sigsetsize)
 {
        sigset_t ksigmask, sigsaved;
        struct timespec ts, end_time, *to = NULL;
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 16c211558c22..5267098532bf 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -54,6 +54,64 @@ int seq_open(struct file *file, const struct seq_operations *op)
 }
 EXPORT_SYMBOL(seq_open);
+static int traverse(struct seq_file *m, loff_t offset)
+{
+        loff_t pos = 0, index;
+        int error = 0;
+        void *p;
+        m->version = 0;
+        index = 0;
+        m->count = m->from = 0;
+        if (!offset) {
+                m->index = index;
+                return 0;
+        }
+        if (!m->buf) {
+                m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL);
+                if (!m->buf)
+                        return -ENOMEM;
+        }
+        p = m->op->start(m, &index);
+        while (p) {
+                error = PTR_ERR(p);
+                if (IS_ERR(p))
+                        break;
+                error = m->op->show(m, p);
+                if (error < 0)
+                        break;
+                if (unlikely(error)) {
+                        error = 0;
+                        m->count = 0;
+                }
+                if (m->count == m->size)
+                        goto Eoverflow;
+                if (pos + m->count > offset) {
+                        m->from = offset - pos;
+                        m->count -= m->from;
+                        m->index = index;
+                        break;
+                }
+                pos += m->count;
+                m->count = 0;
+                if (pos == offset) {
+                        index++;
+                        m->index = index;
+                        break;
+                }
+                p = m->op->next(m, p, &index);
+        }
+        m->op->stop(m, p);
+        m->index = index;
+        return error;
+Eoverflow:
+        m->op->stop(m, p);
+        kfree(m->buf);
+        m->buf = kmalloc(m->size <<= 1, GFP_KERNEL);
+        return !m->buf ? -ENOMEM : -EAGAIN;
+}
 /**
 *      seq_read -      ->read() method for sequential files.
 *      @file: the file to read from
@@ -186,63 +244,6 @@ Efault:
 }
 EXPORT_SYMBOL(seq_read);
-static int traverse(struct seq_file *m, loff_t offset)
-{
-        loff_t pos = 0, index;
-        int error = 0;
-        void *p;
-        m->version = 0;
-        index = 0;
-        m->count = m->from = 0;
-        if (!offset) {
-                m->index = index;
-                return 0;
-        }
-        if (!m->buf) {
-                m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL);
-                if (!m->buf)
-                        return -ENOMEM;
-        }
-        p = m->op->start(m, &index);
-        while (p) {
-                error = PTR_ERR(p);
-                if (IS_ERR(p))
-                        break;
-                error = m->op->show(m, p);
-                if (error < 0)
-                        break;
-                if (unlikely(error)) {
-                        error = 0;
-                        m->count = 0;
-                }
-                if (m->count == m->size)
-                        goto Eoverflow;
-                if (pos + m->count > offset) {
-                        m->from = offset - pos;
-                        m->count -= m->from;
-                        m->index = index;
-                        break;
-                }
-                pos += m->count;
-                m->count = 0;
-                if (pos == offset) {
-                        index++;
-                        m->index = index;
-                        break;
-                }
-                p = m->op->next(m, p, &index);
-        }
-        m->op->stop(m, p);
-        return error;
-Eoverflow:
-        m->op->stop(m, p);
-        kfree(m->buf);
-        m->buf = kmalloc(m->size <<= 1, GFP_KERNEL);
-        return !m->buf ? -ENOMEM : -EAGAIN;
-}
 /**
 *      seq_lseek -     ->llseek() method for sequential files.
 *      @file: the file in question
@@ -389,8 +390,14 @@ char *mangle_path(char *s, char *p, char *esc)
 }
 EXPORT_SYMBOL(mangle_path);
-/*
+/**
- * return the absolute path of 'dentry' residing in mount 'mnt'.
+ * seq_path - seq_file interface to print a pathname
+ * @m: the seq_file handle
+ * @path: the struct path to print
+ * @esc: set of characters to escape in the output
+ *
+ * return the absolute path of 'path', as represented by the
+ * dentry / mnt pair in the path parameter.
 */
 int seq_path(struct seq_file *m, struct path *path, char *esc)
 {
@@ -462,7 +469,8 @@ int seq_dentry(struct seq_file *m, struct dentry *dentry, char *esc)
        return -1;
 }
-int seq_bitmap(struct seq_file *m, unsigned long *bits, unsigned int nr_bits)
+int seq_bitmap(struct seq_file *m, const unsigned long *bits,
+                                   unsigned int nr_bits)
 {
        if (m->count < m->size) {
                int len = bitmap_scnprintf(m->buf + m->count,
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 9c39bc7f8431..b07565c94386 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -205,8 +205,8 @@ static const struct file_operations signalfd_fops = {
        .read           = signalfd_read,
 };
-asmlinkage long sys_signalfd4(int ufd, sigset_t __user *user_mask,
+SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask,
-                              size_t sizemask, int flags)
+                size_t, sizemask, int, flags)
 {
        sigset_t sigmask;
        struct signalfd_ctx *ctx;
@@ -259,8 +259,8 @@ asmlinkage long sys_signalfd4(int ufd, sigset_t __user *user_mask,
        return ufd;
 }
-asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask,
+SYSCALL_DEFINE3(signalfd, int, ufd, sigset_t __user *, user_mask,
-                             size_t sizemask)
+                size_t, sizemask)
 {
        return sys_signalfd4(ufd, user_mask, sizemask, 0);
 }
diff --git a/fs/smbfs/Kconfig b/fs/smbfs/Kconfig
new file mode 100644
index 000000000000..e668127c8b2e
--- /dev/null
+++ b/fs/smbfs/Kconfig
@@ -0,0 +1,55 @@
+config SMB_FS
+        tristate "SMB file system support (OBSOLETE, please use CIFS)"
+        depends on INET
+        select NLS
+        help
+          SMB (Server Message Block) is the protocol Windows for Workgroups
+          (WfW), Windows 95/98, Windows NT and OS/2 Lan Manager use to share
+          files and printers over local networks.  Saying Y here allows you to
+          mount their file systems (often called "shares" in this context) and
+          access them just like any other Unix directory.  Currently, this
+          works only if the Windows machines use TCP/IP as the underlying
+          transport protocol, and not NetBEUI.  For details, read
+          <file:Documentation/filesystems/smbfs.txt> and the SMB-HOWTO,
+          available from <http://www.tldp.org/docs.html#howto>.
+          Note: if you just want your box to act as an SMB *server* and make
+          files and printing services available to Windows clients (which need
+          to have a TCP/IP stack), you don't need to say Y here; you can use
+          the program SAMBA (available from <ftp://ftp.samba.org/pub/samba/>)
+          for that.
+          General information about how to connect Linux, Windows machines and
+          Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>.
+          To compile the SMB support as a module, choose M here:
+          the module will be called smbfs.  Most people say N, however.
+config SMB_NLS_DEFAULT
+        bool "Use a default NLS"
+        depends on SMB_FS
+        help
+          Enabling this will make smbfs use nls translations by default. You
+          need to specify the local charset (CONFIG_NLS_DEFAULT) in the nls
+          settings and you need to give the default nls for the SMB server as
+          CONFIG_SMB_NLS_REMOTE.
+          The nls settings can be changed at mount time, if your smbmount
+          supports that, using the codepage and iocharset parameters.
+          smbmount from samba 2.2.0 or later supports this.
+config SMB_NLS_REMOTE
+        string "Default Remote NLS Option"
+        depends on SMB_NLS_DEFAULT
+        default "cp437"
+        help
+          This setting allows you to specify a default value for which
+          codepage the server uses. If this field is left blank no
+          translations will be done by default. The local codepage/charset
+          defaults to CONFIG_NLS_DEFAULT.
+          The nls settings can be changed at mount time, if your smbmount
+          supports that, using the codepage and iocharset parameters.
+          smbmount from samba 2.2.0 or later supports this.
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index e4f8d51a5553..92d5e8ffb639 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -297,7 +297,7 @@ static int smb_write_begin(struct file *file, struct address_space *mapping,
                        struct page **pagep, void **fsdata)
 {
        pgoff_t index = pos >> PAGE_CACHE_SHIFT;
-        *pagep = __grab_cache_page(mapping, index);
+        *pagep = grab_cache_page_write_begin(mapping, index, flags);
        if (!*pagep)
                return -ENOMEM;
        return 0;
diff --git a/fs/splice.c b/fs/splice.c
index 1abab5cee4ba..4ed0ba44a966 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -21,6 +21,7 @@
 #include <linux/file.h>
 #include <linux/pagemap.h>
 #include <linux/splice.h>
+#include <linux/memcontrol.h>
 #include <linux/mm_inline.h>
 #include <linux/swap.h>
 #include <linux/writeback.h>
@@ -1434,8 +1435,8 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
 * Currently we punt and implement it as a normal copy, see pipe_to_user().
 *
 */
-asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
+SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov,
-                             unsigned long nr_segs, unsigned int flags)
+                unsigned long, nr_segs, unsigned int, flags)
 {
        struct file *file;
        long error;
@@ -1460,9 +1461,9 @@ asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
        return error;
 }
-asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
+SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
-                           int fd_out, loff_t __user *off_out,
+                int, fd_out, loff_t __user *, off_out,
-                           size_t len, unsigned int flags)
+                size_t, len, unsigned int, flags)
 {
        long error;
        struct file *in, *out;
@@ -1684,7 +1685,7 @@ static long do_tee(struct file *in, struct file *out, size_t len,
        return ret;
 }
-asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags)
+SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
 {
        struct file *in;
        int error, fput_in;
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
new file mode 100644
index 000000000000..25a00d19d686
--- /dev/null
+++ b/fs/squashfs/Kconfig
@@ -0,0 +1,51 @@
+config SQUASHFS
+        tristate "SquashFS 4.0 - Squashed file system support"
+        depends on BLOCK
+        select ZLIB_INFLATE
+        help
+          Saying Y here includes support for SquashFS 4.0 (a Compressed
+          Read-Only File System).  Squashfs is a highly compressed read-only
+          filesystem for Linux.  It uses zlib compression to compress
+          files, inodes and directories.  Inodes in the system are very small
+          and all blocks are packed to minimise data overhead. Block sizes
+          greater than 4K are supported up to a maximum of 1 Mbytes (default
+          block size 128K).  SquashFS 4.0 supports 64 bit filesystems and files
+          (larger than 4GB), full uid/gid information, hard links and
+          timestamps.  
+          Squashfs is intended for general read-only filesystem use, for
+          archival use (i.e. in cases where a .tar.gz file may be used), and in
+          embedded systems where low overhead is needed.  Further information
+          and tools are available from http://squashfs.sourceforge.net.
+          If you want to compile this as a module ( = code which can be
+          inserted in and removed from the running kernel whenever you want),
+          say M here and read <file:Documentation/modules.txt>.  The module
+          will be called squashfs.  Note that the root file system (the one
+          containing the directory /) cannot be compiled as a module.
+          If unsure, say N.
+config SQUASHFS_EMBEDDED
+        bool "Additional option for memory-constrained systems" 
+        depends on SQUASHFS
+        default n
+        help
+          Saying Y here allows you to specify cache size.
+          If unsure, say N.
+config SQUASHFS_FRAGMENT_CACHE_SIZE
+        int "Number of fragments cached" if SQUASHFS_EMBEDDED
+        depends on SQUASHFS
+        default "3"
+        help
+          By default SquashFS caches the last 3 fragments read from
+          the filesystem.  Increasing this amount may mean SquashFS
+          has to re-read fragments less often from disk, at the expense
+          of extra system memory.  Decreasing this amount will mean
+          SquashFS uses less memory at the expense of extra reads from disk.
+          Note there must be at least one cached fragment.  Anything
+          much more than three will probably not make much difference.
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
new file mode 100644
index 000000000000..8258cf9a0317
--- /dev/null
+++ b/fs/squashfs/Makefile
@@ -0,0 +1,8 @@
+#
+# Makefile for the linux squashfs routines.
+#
+obj-$(CONFIG_SQUASHFS) += squashfs.o
+squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
+squashfs-y += namei.o super.o symlink.o
+#squashfs-y += squashfs2_0.o
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
new file mode 100644
index 000000000000..c837dfc2b3c6
--- /dev/null
+++ b/fs/squashfs/block.c
@@ -0,0 +1,274 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * block.c
+ */
+/*
+ * This file implements the low-level routines to read and decompress
+ * datablocks and metadata blocks.
+ */
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/slab.h>
+#include <linux/mutex.h>
+#include <linux/string.h>
+#include <linux/buffer_head.h>
+#include <linux/zlib.h>
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+/*
+ * Fetch the 16-bit little-endian length word found at byte *offset of device
+ * block *cur_index (the first two bytes of a metadata block).  The word may
+ * straddle two device blocks, in which case *cur_index is advanced to the
+ * second one.  On success *length holds the raw word, *offset points just
+ * past it, and the buffer_head of the device block holding its last byte is
+ * returned.  NULL is returned if a device block could not be read.
+ */
+static struct buffer_head *get_block_length(struct super_block *sb,
+                        u64 *cur_index, int *offset, int *length)
+{
+        struct squashfs_sb_info *msblk = sb->s_fs_info;
+        struct buffer_head *bh = sb_bread(sb, *cur_index);
+        if (!bh)
+                return NULL;
+        if (msblk->devblksize - *offset != 1) {
+                /* Common case: both length bytes sit in this device block. */
+                unsigned char lo = bh->b_data[*offset];
+                unsigned char hi = bh->b_data[*offset + 1];
+                *length = lo | hi << 8;
+                *offset += 2;
+                return bh;
+        }
+        /*
+         * Only the low byte is left in this device block; the high byte is
+         * the first byte of the following one.
+         */
+        *length = (unsigned char) bh->b_data[*offset];
+        put_bh(bh);
+        *cur_index += 1;
+        bh = sb_bread(sb, *cur_index);
+        if (!bh)
+                return NULL;
+        *length |= (unsigned char) bh->b_data[0] << 8;
+        *offset = 1;
+        return bh;
+}
+/*
+ * Read and decompress a metadata block or datablock.  Length is non-zero
+ * if a datablock is being read (the size is stored elsewhere in the
+ * filesystem), otherwise the length is obtained from the first two bytes of
+ * the metadata block.  A bit in the length field indicates if the block
+ * is stored uncompressed in the filesystem (usually because compression
+ * generated a larger block - this does occasionally happen with zlib).
+ *
+ * sb:         superblock of the filesystem being read
+ * buffer:     array of output buffers, filled in order; each one is treated
+ *             as PAGE_CACHE_SIZE bytes long
+ * index:      byte position of the block on the device
+ * length:     encoded datablock length, or 0 to read a metadata block
+ * next_index: if non-NULL, receives the byte position following this block
+ *             (written before the length is validated, so also on failure)
+ * srclength:  largest stored (on-disk) length that is accepted
+ *
+ * Returns the number of bytes placed in buffer, -ENOMEM if the buffer_head
+ * array cannot be allocated, or -EIO for every other failure.  All -EIO
+ * paths end at read_failure, so the "sb_bread failed" message is logged for
+ * range-check and zlib failures as well.
+ */
+int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
+                        int length, u64 *next_index, int srclength)
+{
+        struct squashfs_sb_info *msblk = sb->s_fs_info;
+        struct buffer_head **bh;
+        int offset = index & ((1 << msblk->devblksize_log2) - 1); /* byte offset in first device block */
+        u64 cur_index = index >> msblk->devblksize_log2; /* device block number */
+        int bytes, compressed, b = 0, k = 0, page = 0, avail; /* b: bh[] entries held, k: next bh[] to consume */
+        bh = kcalloc((msblk->block_size >> msblk->devblksize_log2) + 1,
+                                sizeof(*bh), GFP_KERNEL);
+        if (bh == NULL)
+                return -ENOMEM;
+        if (length) {
+                /*
+                 * Datablock.
+                 */
+                bytes = -offset; /* discount the bytes that precede the block in the first device block */
+                compressed = SQUASHFS_COMPRESSED_BLOCK(length);
+                length = SQUASHFS_COMPRESSED_SIZE_BLOCK(length);
+                if (next_index)
+                        *next_index = index + length;
+                TRACE("Block @ 0x%llx, %scompressed size %d, src size %d\n",
+                        index, compressed ? "" : "un", length, srclength);
+                if (length < 0 || length > srclength ||
+                                (index + length) > msblk->bytes_used)
+                        goto read_failure;
+                for (b = 0; bytes < length; b++, cur_index++) {
+                        bh[b] = sb_getblk(sb, cur_index);
+                        if (bh[b] == NULL)
+                                goto block_release;
+                        bytes += msblk->devblksize;
+                }
+                ll_rw_block(READ, b, bh);
+        } else {
+                /*
+                 * Metadata block.
+                 */
+                if ((index + 2) > msblk->bytes_used)
+                        goto read_failure;
+                bh[0] = get_block_length(sb, &cur_index, &offset, &length);
+                if (bh[0] == NULL)
+                        goto read_failure;
+                b = 1;
+                bytes = msblk->devblksize - offset;
+                compressed = SQUASHFS_COMPRESSED(length);
+                length = SQUASHFS_COMPRESSED_SIZE(length);
+                if (next_index)
+                        *next_index = index + length + 2; /* + 2 skips the length word itself */
+                TRACE("Block @ 0x%llx, %scompressed size %d\n", index,
+                                compressed ? "" : "un", length);
+                if (length < 0 || length > srclength ||
+                                        (index + length) > msblk->bytes_used)
+                        goto block_release;
+                for (; bytes < length; b++) {
+                        bh[b] = sb_getblk(sb, ++cur_index);
+                        if (bh[b] == NULL)
+                                goto block_release;
+                        bytes += msblk->devblksize;
+                }
+                ll_rw_block(READ, b - 1, bh + 1); /* bh[0] was already read by get_block_length() */
+        }
+        if (compressed) {
+                int zlib_err = 0, zlib_init = 0;
+                /*
+                 * Uncompress block.  msblk->stream is shared state, so the
+                 * whole inflate sequence runs under read_data_mutex.  Device
+                 * blocks are fed to zlib one at a time and released as soon
+                 * as they have been consumed.
+                 */
+                mutex_lock(&msblk->read_data_mutex);
+                msblk->stream.avail_out = 0;
+                msblk->stream.avail_in = 0;
+                bytes = length;
+                do {
+                        if (msblk->stream.avail_in == 0 && k < b) {
+                                avail = min(bytes, msblk->devblksize - offset);
+                                bytes -= avail;
+                                wait_on_buffer(bh[k]);
+                                if (!buffer_uptodate(bh[k]))
+                                        goto release_mutex;
+                                if (avail == 0) {
+                                        offset = 0;
+                                        put_bh(bh[k++]);
+                                        continue;
+                                }
+                                msblk->stream.next_in = bh[k]->b_data + offset;
+                                msblk->stream.avail_in = avail;
+                                offset = 0;
+                        }
+                        if (msblk->stream.avail_out == 0) {
+                                msblk->stream.next_out = buffer[page++]; /* NOTE(review): page is not bounds-checked here - confirm output cannot exceed the buffer array */
+                                msblk->stream.avail_out = PAGE_CACHE_SIZE;
+                        }
+                        if (!zlib_init) {
+                                zlib_err = zlib_inflateInit(&msblk->stream);
+                                if (zlib_err != Z_OK) {
+                                        ERROR("zlib_inflateInit returned"
+                                                " unexpected result 0x%x,"
+                                                " srclength %d\n", zlib_err,
+                                                srclength);
+                                        goto release_mutex;
+                                }
+                                zlib_init = 1;
+                        }
+                        zlib_err = zlib_inflate(&msblk->stream, Z_NO_FLUSH);
+                        if (msblk->stream.avail_in == 0 && k < b)
+                                put_bh(bh[k++]);
+                } while (zlib_err == Z_OK);
+                if (zlib_err != Z_STREAM_END) {
+                        ERROR("zlib_inflate returned unexpected result"
+                                " 0x%x, srclength %d, avail_in %d,"
+                                " avail_out %d\n", zlib_err, srclength,
+                                msblk->stream.avail_in,
+                                msblk->stream.avail_out);
+                        goto release_mutex;
+                }
+                zlib_err = zlib_inflateEnd(&msblk->stream);
+                if (zlib_err != Z_OK) {
+                        ERROR("zlib_inflateEnd returned unexpected result 0x%x,"
+                                " srclength %d\n", zlib_err, srclength);
+                        goto release_mutex;
+                }
+                length = msblk->stream.total_out; /* return the decompressed size */
+                mutex_unlock(&msblk->read_data_mutex);
+        } else {
+                /*
+                 * Block is uncompressed.  Wait for every device block first,
+                 * then copy the payload into the output buffers, moving to
+                 * the next output buffer each time PAGE_CACHE_SIZE bytes
+                 * have been written.
+                 */
+                int i, in, pg_offset = 0;
+                for (i = 0; i < b; i++) {
+                        wait_on_buffer(bh[i]);
+                        if (!buffer_uptodate(bh[i]))
+                                goto block_release;
+                }
+                for (bytes = length; k < b; k++) {
+                        in = min(bytes, msblk->devblksize - offset);
+                        bytes -= in;
+                        while (in) {
+                                if (pg_offset == PAGE_CACHE_SIZE) {
+                                        page++;
+                                        pg_offset = 0;
+                                }
+                                avail = min_t(int, in, PAGE_CACHE_SIZE -
+                                                pg_offset);
+                                memcpy(buffer[page] + pg_offset,
+                                                bh[k]->b_data + offset, avail);
+                                in -= avail;
+                                pg_offset += avail;
+                                offset += avail;
+                        }
+                        offset = 0;
+                        put_bh(bh[k]);
+                }
+        }
+        kfree(bh);
+        return length;
+release_mutex:
+        mutex_unlock(&msblk->read_data_mutex);
+block_release:
+        for (; k < b; k++) /* drop only the buffer_heads not yet released above */
+                put_bh(bh[k]);
+read_failure:
+        ERROR("sb_bread failed reading block 0x%llx\n", cur_index);
+        kfree(bh);
+        return -EIO;
+}
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
new file mode 100644
index 000000000000..f29eda16d25e
--- /dev/null
+++ b/fs/squashfs/cache.c
@@ -0,0 +1,412 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * cache.c
+ */
+/*
+ * Blocks in Squashfs are compressed.  To avoid repeatedly decompressing
+ * recently accessed data Squashfs uses two small metadata and fragment caches.
+ *
+ * This file implements a generic cache implementation used for both caches,
+ * plus functions layered on top of the generic cache implementation to
+ * access the metadata and fragment caches.
+ *
+ * To avoid out of memory and fragmentation issues with vmalloc the cache
+ * uses sequences of kmalloced PAGE_CACHE_SIZE buffers.
+ *
+ * It should be noted that the cache is not used for file datablocks, these
+ * are decompressed and cached in the page-cache in the normal way.  The
+ * cache is only used to temporarily cache fragment and metadata blocks
+ * which have been read as a result of a metadata (i.e. inode or
+ * directory) or fragment access.  Because metadata and fragments are packed
+ * together into blocks (to gain greater compression) the read of a particular
+ * piece of metadata or fragment will retrieve other metadata/fragments which
+ * have been packed with it, these because of locality-of-reference may be read
+ * in the near future. Temporarily caching them ensures they are available for
+ * near future access without requiring an additional read and decompress.
+ */
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/wait.h>
+#include <linux/zlib.h>
+#include <linux/pagemap.h>
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+/*
+ * Look-up block in cache, and increment usage count.  If not in cache, read
+ * and decompress it from disk.
+ *
+ * Lookup and bookkeeping run under cache->lock; the lock is dropped around
+ * squashfs_read_data() and around every sleep.  length is handed unchanged
+ * to squashfs_read_data().
+ *
+ * The returned entry always has its refcount raised, including when
+ * entry->error is set (a failed read stores the negative return value there).
+ * Callers are therefore expected to check entry->error and to release the
+ * entry with squashfs_cache_put() in either case.
+ */
+struct squashfs_cache_entry *squashfs_cache_get(struct super_block *sb,
+        struct squashfs_cache *cache, u64 block, int length)
+{
+        int i, n;
+        struct squashfs_cache_entry *entry;
+        spin_lock(&cache->lock);
+        while (1) {
+                for (i = 0; i < cache->entries; i++)
+                        if (cache->entry[i].block == block)
+                                break;
+                if (i == cache->entries) {
+                        /*
+                         * Block not in cache, if all cache entries are used
+                         * go to sleep waiting for one to become available.
+                         */
+                        if (cache->unused == 0) {
+                                cache->num_waiters++;
+                                spin_unlock(&cache->lock);
+                                wait_event(cache->wait_queue, cache->unused);
+                                spin_lock(&cache->lock);
+                                cache->num_waiters--;
+                                continue; /* rescan: the block may have been cached while we slept */
+                        }
+                        /*
+                         * At least one unused cache entry.  A simple
+                         * round-robin strategy is used to choose the entry to
+                         * be evicted from the cache.
+                         */
+                        i = cache->next_blk;
+                        for (n = 0; n < cache->entries; n++) {
+                                if (cache->entry[i].refcount == 0)
+                                        break;
+                                i = (i + 1) % cache->entries;
+                        }
+                        cache->next_blk = (i + 1) % cache->entries;
+                        entry = &cache->entry[i];
+                        /*
+                         * Initialise chosen cache entry, and fill it in from
+                         * disk.  The entry is marked pending so that other
+                         * lookups of the same block sleep until it is filled.
+                         */
+                        cache->unused--;
+                        entry->block = block;
+                        entry->refcount = 1;
+                        entry->pending = 1;
+                        entry->num_waiters = 0;
+                        entry->error = 0;
+                        spin_unlock(&cache->lock);
+                        entry->length = squashfs_read_data(sb, entry->data,
+                                block, length, &entry->next_index,
+                                cache->block_size);
+                        spin_lock(&cache->lock);
+                        if (entry->length < 0)
+                                entry->error = entry->length;
+                        entry->pending = 0;
+                        /*
+                         * While filling this entry one or more other processes
+                         * have looked it up in the cache, and have slept
+                         * waiting for it to become available.
+                         */
+                        if (entry->num_waiters) {
+                                spin_unlock(&cache->lock);
+                                wake_up_all(&entry->wait_queue);
+                        } else
+                                spin_unlock(&cache->lock);
+                        goto out;
+                }
+                /*
+                 * Block already in cache.  Increment refcount so it doesn't
+                 * get reused until we're finished with it, if it was
+                 * previously unused there's one less cache entry available
+                 * for reuse.
+                 */
+                entry = &cache->entry[i];
+                if (entry->refcount == 0)
+                        cache->unused--;
+                entry->refcount++;
+                /*
+                 * If the entry is currently being filled in by another process
+                 * go to sleep waiting for it to become available.
+                 */
+                if (entry->pending) {
+                        entry->num_waiters++;
+                        spin_unlock(&cache->lock);
+                        wait_event(entry->wait_queue, !entry->pending);
+                } else
+                        spin_unlock(&cache->lock);
+                goto out;
+        }
+out:
+        TRACE("Got %s %d, start block %lld, refcount %d, error %d\n",
+                cache->name, i, entry->block, entry->refcount, entry->error);
+        if (entry->error)
+                ERROR("Unable to read %s cache entry [%llx]\n", cache->name,
+                                                        block);
+        return entry;
+}
+/*
+ * Drop one reference to a cache entry.  When the last reference goes the
+ * entry becomes available for reuse, and one process sleeping for a free
+ * entry (if any) is woken after the cache lock has been released.
+ */
+void squashfs_cache_put(struct squashfs_cache_entry *entry)
+{
+        struct squashfs_cache *owner = entry->cache;
+        int wake = 0;
+        spin_lock(&owner->lock);
+        if (--entry->refcount == 0) {
+                owner->unused++;
+                /* Decide under the lock, wake once it has been dropped. */
+                wake = owner->num_waiters != 0;
+        }
+        spin_unlock(&owner->lock);
+        if (wake)
+                wake_up(&owner->wait_queue);
+}
+/*
+ * Tear down a cache: free every entry's page buffers and page table, then
+ * the entry array and the cache itself.  A NULL cache is ignored, and a
+ * partially constructed cache (entries without a page table, page tables
+ * with NULL slots) is handled.
+ */
+void squashfs_cache_delete(struct squashfs_cache *cache)
+{
+        int slot, pg;
+        if (!cache)
+                return;
+        for (slot = 0; slot < cache->entries; slot++) {
+                struct squashfs_cache_entry *entry = &cache->entry[slot];
+                if (!entry->data)
+                        continue;
+                for (pg = 0; pg < cache->pages; pg++)
+                        kfree(entry->data[pg]);
+                kfree(entry->data);
+        }
+        kfree(cache->entry);
+        kfree(cache);
+}
+/*
+ * Create a cache called name that holds entries blocks of block_size bytes
+ * each.  The storage of every entry is a table of individually kmalloced
+ * PAGE_CACHE_SIZE buffers rather than one large allocation, which avoids
+ * vmalloc fragmentation issues.  Returns the new cache, or NULL (after
+ * releasing whatever had been allocated so far) if an allocation fails.
+ */
+struct squashfs_cache *squashfs_cache_init(char *name, int entries,
+        int block_size)
+{
+        struct squashfs_cache *cache;
+        struct squashfs_cache_entry *entry;
+        int slot, pg;
+        cache = kzalloc(sizeof(*cache), GFP_KERNEL);
+        if (!cache) {
+                ERROR("Failed to allocate %s cache\n", name);
+                return NULL;
+        }
+        cache->entry = kcalloc(entries, sizeof(*(cache->entry)), GFP_KERNEL);
+        if (!cache->entry) {
+                ERROR("Failed to allocate %s cache\n", name);
+                goto cleanup;
+        }
+        cache->name = name;
+        cache->entries = entries;
+        cache->unused = entries;
+        cache->next_blk = 0;
+        cache->num_waiters = 0;
+        cache->block_size = block_size;
+        cache->pages = block_size >> PAGE_CACHE_SHIFT;
+        spin_lock_init(&cache->lock);
+        init_waitqueue_head(&cache->wait_queue);
+        for (slot = 0; slot < entries; slot++) {
+                entry = &cache->entry[slot];
+                init_waitqueue_head(&entry->wait_queue);
+                entry->cache = cache;
+                entry->block = SQUASHFS_INVALID_BLK;
+                /* Zeroed page table, so cleanup can kfree unfilled slots. */
+                entry->data = kcalloc(cache->pages, sizeof(void *), GFP_KERNEL);
+                if (!entry->data) {
+                        ERROR("Failed to allocate %s cache entry\n", name);
+                        goto cleanup;
+                }
+                for (pg = 0; pg < cache->pages; pg++) {
+                        entry->data[pg] = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
+                        if (!entry->data[pg]) {
+                                ERROR("Failed to allocate %s buffer\n", name);
+                                goto cleanup;
+                        }
+                }
+        }
+        return cache;
+cleanup:
+        squashfs_cache_delete(cache);
+        return NULL;
+}
+/*
+ * Copy up to length bytes out of a cache entry, starting offset bytes into
+ * the entry, and return how many bytes were copied.  Fewer than length bytes
+ * are copied when the entry holds less data past offset.  If buffer is NULL
+ * nothing is copied and only the count that would be available is returned.
+ */
+int squashfs_copy_data(void *buffer, struct squashfs_cache_entry *entry,
+                int offset, int length)
+{
+        int copied = 0;
+        if (length == 0)
+                return 0;
+        if (buffer == NULL)
+                return min(length, entry->length - offset);
+        while (offset < entry->length) {
+                /* The entry's data is split across PAGE_CACHE_SIZE buffers. */
+                int pg_off = offset % PAGE_CACHE_SIZE;
+                void *src = entry->data[offset / PAGE_CACHE_SIZE] + pg_off;
+                int chunk = min_t(int, entry->length - offset,
+                                PAGE_CACHE_SIZE - pg_off);
+                int wanted = length - copied;
+                if (chunk >= wanted) {
+                        /* This buffer satisfies the rest of the request. */
+                        memcpy(buffer + copied, src, wanted);
+                        copied = length;
+                        break;
+                }
+                memcpy(buffer + copied, src, chunk);
+                copied += chunk;
+                offset += chunk;
+        }
+        return copied;
+}
+/*
+ * Read length bytes from metadata position <block, offset> (block is the
+ * start of the compressed block on disk, and offset is the offset into
+ * the block once decompressed).  Data is packed into consecutive blocks,
+ * and length bytes may require reading more than one block.
+ *
+ * buffer may be NULL, in which case the position is advanced without any
+ * data being copied.  On return *block and *offset refer to the position
+ * reached so far.
+ *
+ * Returns length on success, the cache entry's error code if a block could
+ * not be read, or -EIO if *offset lies beyond the end of the decompressed
+ * block.
+ */
+int squashfs_read_metadata(struct super_block *sb, void *buffer,
+                u64 *block, int *offset, int length)
+{
+        struct squashfs_sb_info *msblk = sb->s_fs_info;
+        int bytes, res = length;
+        struct squashfs_cache_entry *entry;
+        TRACE("Entered squashfs_read_metadata [%llx:%x]\n", *block, *offset);
+        while (length) {
+                entry = squashfs_cache_get(sb, msblk->block_cache, *block, 0);
+                if (entry->error) {
+                        res = entry->error;
+                        goto error;
+                } else if (*offset >= entry->length) {
+                        res = -EIO;
+                        goto error;
+                }
+                bytes = squashfs_copy_data(buffer, entry, *offset, length);
+                if (buffer)
+                        buffer += bytes;
+                length -= bytes;
+                *offset += bytes;
+                if (*offset == entry->length) {
+                        *block = entry->next_index;
+                        *offset = 0;
+                }
+                squashfs_cache_put(entry);
+        }
+        return res;
+error:
+        /*
+         * squashfs_cache_get() takes a reference even when it reports an
+         * error, so it must be dropped here or the cache entry can never
+         * be reused.
+         */
+        squashfs_cache_put(entry);
+        return res;
+}
+/*
+ * Return the fragment cache entry for the fragment stored at <start_block>
+ * in the filesystem, reading and decompressing it from disk if it is not
+ * already cached.
+ */
+struct squashfs_cache_entry *squashfs_get_fragment(struct super_block *sb,
+                                u64 start_block, int length)
+{
+        struct squashfs_sb_info *sbi = sb->s_fs_info;
+        struct squashfs_cache *fragments = sbi->fragment_cache;
+        return squashfs_cache_get(sb, fragments, start_block, length);
+}
+/*
+ * Return a cache entry holding the decompressed datablock stored at
+ * <start_block> in the filesystem.  Going through the read_page cache
+ * reuses the generic cache's locking and read/decompress code instead of
+ * duplicating it.
+ */
+struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *sb,
+                                u64 start_block, int length)
+{
+        struct squashfs_sb_info *sbi = sb->s_fs_info;
+        struct squashfs_cache *datablocks = sbi->read_page;
+        return squashfs_cache_get(sb, datablocks, start_block, length);
+}
+/*
+ * Read a filesystem table (an uncompressed sequence of length bytes starting
+ * at byte position block) from disk into buffer.  Returns the result of
+ * squashfs_read_data(), or -ENOMEM if the temporary page table cannot be
+ * allocated.
+ */
+int squashfs_read_table(struct super_block *sb, void *buffer, u64 block,
+        int length)
+{
+        int pages = (length + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+        int pg = 0, res;
+        void *chunk = buffer;
+        void **data;
+        data = kcalloc(pages, sizeof(void *), GFP_KERNEL);
+        if (!data)
+                return -ENOMEM;
+        /*
+         * squashfs_read_data() wants an array of PAGE_CACHE_SIZE buffers,
+         * so describe the caller's contiguous buffer one page at a time.
+         */
+        while (pg < pages) {
+                data[pg++] = chunk;
+                chunk += PAGE_CACHE_SIZE;
+        }
+        /* Setting the "uncompressed" bit makes the read a plain copy. */
+        res = squashfs_read_data(sb, data, block,
+                length | SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, length);
+        kfree(data);
+        return res;
+}
diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c
new file mode 100644
index 000000000000..566b0eaed868
--- /dev/null
+++ b/fs/squashfs/dir.c
@@ -0,0 +1,235 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * dir.c
+ */
+/*
+ * This file implements code to read directories from disk.
+ *
+ * See namei.c for a description of directory organisation on disk.
+ */
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/slab.h>
+#include <linux/zlib.h>
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+/*
+ * Translation table from the type value stored in an on-disk directory
+ * entry (used as the array index) to the DT_* code handed to filldir.
+ */
+static const unsigned char squashfs_filetype_table[] = {
+        DT_UNKNOWN,
+        DT_DIR,
+        DT_REG,
+        DT_LNK,
+        DT_BLK,
+        DT_CHR,
+        DT_FIFO,
+        DT_SOCK
+};
+/*
+ * Lookup offset (f_pos) in the directory index, returning the
+ * metadata block containing it.
+ *
+ * On entry <next_block, next_offset> describe the start of the directory.
+ * For every index entry whose position is at or before f_pos, *next_block
+ * is moved to that entry's start block (relative to the directory table).
+ * On exit *next_offset has been advanced by the number of directory bytes
+ * skipped, modulo SQUASHFS_METADATA_SIZE.
+ *
+ * Returns the external f_pos corresponding to the position reached (the
+ * number of directory bytes skipped, plus 3), or f_pos itself when
+ * f_pos < 3.
+ *
+ * If we get an error reading the index then return the part of the index
+ * (if any) we have managed to read - the index isn't essential, just
+ * quicker.  A corrupt index entry is treated the same way.
+ */
+static int get_dir_index_using_offset(struct super_block *sb,
+        u64 *next_block, int *next_offset, u64 index_start, int index_offset,
+        int i_count, u64 f_pos)
+{
+        struct squashfs_sb_info *msblk = sb->s_fs_info;
+        int err, i, index, length = 0;
+        unsigned int size;
+        struct squashfs_dir_index dir_index;
+        TRACE("Entered get_dir_index_using_offset, i_count %d, f_pos %lld\n",
+                                        i_count, f_pos);
+        /*
+         * Translate from external f_pos to the internal f_pos.  This
+         * is offset by 3 because we invent "." and ".." entries which are
+         * not actually stored in the directory.
+         */
+        if (f_pos < 3)
+                return f_pos;
+        f_pos -= 3;
+        for (i = 0; i < i_count; i++) {
+                err = squashfs_read_metadata(sb, &dir_index, &index_start,
+                                &index_offset, sizeof(dir_index));
+                if (err < 0)
+                        break;
+                index = le32_to_cpu(dir_index.index);
+                if (index > f_pos)
+                        /*
+                         * Found the index we're looking for.
+                         */
+                        break;
+                /*
+                 * The name length comes straight from disk and is used
+                 * as a byte count below.  Unchecked, a corrupt value can
+                 * wrap to zero or turn negative once converted to the int
+                 * length argument.  Index names are file names, so
+                 * (presumably like directory entry names) they should
+                 * never exceed SQUASHFS_NAME_LEN; anything else is treated
+                 * as a corrupt index and we fall back to what has been
+                 * read so far.
+                 */
+                size = le32_to_cpu(dir_index.size) + 1;
+                if (size == 0 || size > SQUASHFS_NAME_LEN)
+                        break;
+                /* Skip over the name (NULL buffer: nothing is copied) */
+                err = squashfs_read_metadata(sb, NULL, &index_start,
+                                &index_offset, size);
+                if (err < 0)
+                        break;
+                length = index;
+                *next_block = le32_to_cpu(dir_index.start_block) +
+                                        msblk->directory_table;
+        }
+        *next_offset = (length + *next_offset) % SQUASHFS_METADATA_SIZE;
+        /*
+         * Translate back from internal f_pos to external f_pos.
+         */
+        return length + 3;
+}
+/*
+ * readdir implementation: feed the entries of the directory behind <file>
+ * to <filldir>, starting at file->f_pos and advancing f_pos as entries are
+ * accepted.
+ *
+ * Always returns 0.  An allocation failure, a metadata read error or a
+ * corrupt directory (over-long name, unknown entry type) is logged and
+ * simply ends the listing early.
+ */
+static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir)
+{
+        struct inode *inode = file->f_dentry->d_inode;
+        struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+        u64 block = squashfs_i(inode)->start + msblk->directory_table;
+        int offset = squashfs_i(inode)->offset, length = 0, dir_count, size,
+                                type, err;
+        unsigned int inode_number;
+        struct squashfs_dir_header dirh;
+        struct squashfs_dir_entry *dire;
+        TRACE("Entered squashfs_readdir [%llx:%x]\n", block, offset);
+        /* Room for the fixed entry, the longest name and a trailing NUL */
+        dire = kmalloc(sizeof(*dire) + SQUASHFS_NAME_LEN + 1, GFP_KERNEL);
+        if (dire == NULL) {
+                ERROR("Failed to allocate squashfs_dir_entry\n");
+                goto finish;
+        }
+        /*
+         * Return "." and  ".." entries as the first two filenames in the
+         * directory.  To maximise compression these two entries are not
+         * stored in the directory, and so we invent them here.
+         *
+         * It also means that the external f_pos is offset by 3 from the
+         * on-disk directory f_pos.
+         */
+        while (file->f_pos < 3) {
+                char *name;
+                int i_ino;
+                if (file->f_pos == 0) {
+                        name = ".";
+                        size = 1;
+                        i_ino = inode->i_ino;
+                } else {
+                        name = "..";
+                        size = 2;
+                        i_ino = squashfs_i(inode)->parent;
+                }
+                TRACE("Calling filldir(%p, %s, %d, %lld, %d, %d)\n",
+                                dirent, name, size, file->f_pos, i_ino,
+                                squashfs_filetype_table[1]);
+                if (filldir(dirent, name, size, file->f_pos, i_ino,
+                                squashfs_filetype_table[1]) < 0) {
+                        TRACE("Filldir returned less than 0\n");
+                        goto finish;
+                }
+                /* f_pos goes 0 -> 1 -> 3, i.e. past both invented entries */
+                file->f_pos += size;
+        }
+        /* Use the directory index (if any) to skip close to f_pos */
+        length = get_dir_index_using_offset(inode->i_sb, &block, &offset,
+                                squashfs_i(inode)->dir_idx_start,
+                                squashfs_i(inode)->dir_idx_offset,
+                                squashfs_i(inode)->dir_idx_cnt,
+                                file->f_pos);
+        while (length < i_size_read(inode)) {
+                /*
+                 * Read directory header
+                 */
+                err = squashfs_read_metadata(inode->i_sb, &dirh, &block,
+                                        &offset, sizeof(dirh));
+                if (err < 0)
+                        goto failed_read;
+                length += sizeof(dirh);
+                dir_count = le32_to_cpu(dirh.count) + 1;
+                while (dir_count--) {
+                        /*
+                         * Read directory entry.
+                         */
+                        err = squashfs_read_metadata(inode->i_sb, dire, &block,
+                                        &offset, sizeof(*dire));
+                        if (err < 0)
+                                goto failed_read;
+                        size = le16_to_cpu(dire->size) + 1;
+                        /*
+                         * The name is read into, and NUL terminated inside,
+                         * a buffer of SQUASHFS_NAME_LEN + 1 bytes.  The size
+                         * comes from disk, so reject anything larger rather
+                         * than overrunning the allocation.
+                         */
+                        if (size > SQUASHFS_NAME_LEN)
+                                goto failed_read;
+                        err = squashfs_read_metadata(inode->i_sb, dire->name,
+                                        &block, &offset, size);
+                        if (err < 0)
+                                goto failed_read;
+                        length += sizeof(*dire) + size;
+                        /* Entry lies before the requested position: skip */
+                        if (file->f_pos >= length)
+                                continue;
+                        dire->name[size] = '\0';
+                        /*
+                         * The entry stores its inode number as a signed
+                         * 16-bit delta from the number in the header.
+                         */
+                        inode_number = le32_to_cpu(dirh.inode_number) +
+                                ((short) le16_to_cpu(dire->inode_number));
+                        type = le16_to_cpu(dire->type);
+                        /*
+                         * type is used to index squashfs_filetype_table;
+                         * an on-disk value beyond the table is corruption.
+                         */
+                        if (type >= (int) (sizeof(squashfs_filetype_table) /
+                                        sizeof(squashfs_filetype_table[0])))
+                                goto failed_read;
+                        TRACE("Calling filldir(%p, %s, %d, %lld, %x:%x, %d, %d)"
+                                        "\n", dirent, dire->name, size,
+                                        file->f_pos,
+                                        le32_to_cpu(dirh.start_block),
+                                        le16_to_cpu(dire->offset),
+                                        inode_number,
+                                        squashfs_filetype_table[type]);
+                        if (filldir(dirent, dire->name, size, file->f_pos,
+                                        inode_number,
+                                        squashfs_filetype_table[type]) < 0) {
+                                TRACE("Filldir returned less than 0\n");
+                                goto finish;
+                        }
+                        file->f_pos = length;
+                }
+        }
+finish:
+        kfree(dire);
+        return 0;
+failed_read:
+        ERROR("Unable to read directory block [%llx:%x]\n", block, offset);
+        kfree(dire);
+        return 0;
+}
+/*
+ * File operations for Squashfs directories: directory listing is served by
+ * squashfs_readdir(), read() is routed to generic_read_dir.
+ */
+const struct file_operations squashfs_dir_ops = {
+        .readdir = squashfs_readdir,
+        .read = generic_read_dir,
+};
diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c
new file mode 100644
index 000000000000..69e971d5ddc1
--- /dev/null
+++ b/fs/squashfs/export.c
@@ -0,0 +1,155 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * export.c
+ */
+/*
+ * This file implements code to make Squashfs filesystems exportable (NFS etc.)
+ *
+ * The export code uses an inode lookup table to map inode numbers passed in
+ * filehandles to an inode location on disk.  This table is stored compressed
+ * into metadata blocks.  A second index table is used to locate these.  This
+ * second index table for speed of access (and because it is small) is read at
+ * mount time and cached in memory.
+ *
+ * The inode lookup table is used only by the export code, inode disk
+ * locations are directly encoded in directories, enabling direct access
+ * without an intermediate lookup for all operations except the export ops.
+ */
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/dcache.h>
+#include <linux/exportfs.h>
+#include <linux/zlib.h>
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+/*
+ * Look-up inode number (ino) in table, returning the inode location.
+ *
+ * Returns the 64-bit value read from the lookup table on success, or a
+ * negative error code: -EINVAL for an inode number that cannot be valid,
+ * otherwise whatever squashfs_read_metadata() reported.
+ */
+static long long squashfs_inode_lookup(struct super_block *sb, int ino_num)
+{
+        struct squashfs_sb_info *msblk = sb->s_fs_info;
+        int blk, offset, err;
+        u64 start;
+        __le64 ino;
+        TRACE("Entered squashfs_inode_lookup, inode_number = %d\n", ino_num);
+        /*
+         * ino_num ultimately comes from a file handle and so cannot be
+         * trusted.  The table is indexed with ino_num - 1, so zero or a
+         * negative number would index before the start of the in-memory
+         * index table.
+         *
+         * NOTE(review): there is no upper bound check here because the
+         * number of inodes is not available from what this function can
+         * see; an over-large ino_num still reads past the end of
+         * inode_lookup_table.  Worth fixing once the count is kept in
+         * the superblock info.
+         */
+        if (ino_num <= 0)
+                return -EINVAL;
+        blk = SQUASHFS_LOOKUP_BLOCK(ino_num - 1);
+        offset = SQUASHFS_LOOKUP_BLOCK_OFFSET(ino_num - 1);
+        start = le64_to_cpu(msblk->inode_lookup_table[blk]);
+        err = squashfs_read_metadata(sb, &ino, &start, &offset, sizeof(ino));
+        if (err < 0)
+                return err;
+        TRACE("squashfs_inode_lookup, inode = 0x%llx\n",
+                (u64) le64_to_cpu(ino));
+        return le64_to_cpu(ino);
+}
+/*
+ * Turn an inode number into a dentry: resolve the number to an on-disk
+ * inode location, load the inode and obtain an alias dentry for it.
+ * Returns ERR_PTR(-ENOENT) if the inode number lookup fails.
+ */
+static struct dentry *squashfs_export_iget(struct super_block *sb,
+        unsigned int ino_num)
+{
+        long long location;
+        TRACE("Entered squashfs_export_iget\n");
+        location = squashfs_inode_lookup(sb, ino_num);
+        if (location < 0)
+                return ERR_PTR(-ENOENT);
+        return d_obtain_alias(squashfs_iget(sb, location, ino_num));
+}
+/*
+ * Export operation: map a file handle to the dentry of the object it names.
+ * Only FILEID_INO32_GEN and FILEID_INO32_GEN_PARENT handles of length at
+ * least 2 are accepted; anything else yields NULL.
+ */
+static struct dentry *squashfs_fh_to_dentry(struct super_block *sb,
+                struct fid *fid, int fh_len, int fh_type)
+{
+        if (fh_len < 2)
+                return NULL;
+        if (fh_type == FILEID_INO32_GEN ||
+                        fh_type == FILEID_INO32_GEN_PARENT)
+                return squashfs_export_iget(sb, fid->i32.ino);
+        return NULL;
+}
+/*
+ * Export operation: map a file handle to the dentry of the parent of the
+ * object it names.  Requires a FILEID_INO32_GEN_PARENT handle of length at
+ * least 4; anything else yields NULL.
+ */
+static struct dentry *squashfs_fh_to_parent(struct super_block *sb,
+                struct fid *fid, int fh_len, int fh_type)
+{
+        if (fh_type == FILEID_INO32_GEN_PARENT && fh_len >= 4)
+                return squashfs_export_iget(sb, fid->i32.parent_ino);
+        return NULL;
+}
+/*
+ * Export operation: return a dentry for the parent of <child>, using the
+ * parent inode number recorded in the child's Squashfs inode info.
+ */
+static struct dentry *squashfs_get_parent(struct dentry *child)
+{
+        struct inode *child_inode = child->d_inode;
+        unsigned int parent = squashfs_i(child_inode)->parent;
+        struct super_block *sb = child_inode->i_sb;
+        return squashfs_export_iget(sb, parent);
+}
+/*
+ * Read uncompressed inode lookup table indexes off disk into memory.
+ *
+ * The table size is derived from <inodes> with SQUASHFS_LOOKUP_BLOCK_BYTES()
+ * and read starting at <lookup_table_start>.  On success the newly
+ * allocated table is returned and is owned by the caller; on failure an
+ * ERR_PTR() carrying -ENOMEM or the read error is returned.
+ */
+__le64 *squashfs_read_inode_lookup_table(struct super_block *sb,
+                u64 lookup_table_start, unsigned int inodes)
+{
+        unsigned int length = SQUASHFS_LOOKUP_BLOCK_BYTES(inodes);
+        __le64 *table;
+        int err;
+        TRACE("In read_inode_lookup_table, length %d\n", length);
+        /* Allocate inode lookup table indexes */
+        table = kmalloc(length, GFP_KERNEL);
+        if (!table) {
+                ERROR("Failed to allocate inode lookup table\n");
+                return ERR_PTR(-ENOMEM);
+        }
+        err = squashfs_read_table(sb, table, lookup_table_start, length);
+        if (err >= 0)
+                return table;
+        ERROR("unable to read inode lookup table\n");
+        kfree(table);
+        return ERR_PTR(err);
+}
+/*
+ * Export operations table: file handle decoding and parent lookup, all
+ * implemented on top of squashfs_export_iget().
+ */
+const struct export_operations squashfs_export_ops = {
+        .get_parent = squashfs_get_parent,
+        .fh_to_parent = squashfs_fh_to_parent,
+        .fh_to_dentry = squashfs_fh_to_dentry,
+};
diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c
new file mode 100644
index 000000000000..717767d831df
--- /dev/null
+++ b/fs/squashfs/file.c
@@ -0,0 +1,502 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * file.c
+ */
+/*
+ * This file contains code for handling regular files.  A regular file
+ * consists of a sequence of contiguous compressed blocks, and/or a
+ * compressed fragment block (tail-end packed block).   The compressed size
+ * of each datablock is stored in a block list contained within the
+ * file inode (itself stored in one or more compressed metadata blocks).
+ *
+ * To speed up access to datablocks when reading 'large' files (256 Mbytes or
+ * larger), the code implements an index cache that caches the mapping from
+ * block index to datablock location on disk.
+ *
+ * The index cache allows Squashfs to handle large files (up to 1.75 TiB) while
+ * retaining a simple and space-efficient block list on disk.  The cache
+ * is split into slots, caching up to eight 224 GiB files (128 KiB blocks).
+ * Larger files use multiple slots, with 1.75 TiB files using all 8 slots.
+ * The index cache is designed to be memory efficient, and by default uses
+ * 16 KiB.
+ */
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/pagemap.h>
+#include <linux/mutex.h>
+#include <linux/zlib.h>
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+/*
+ * Locate cache slot in range [offset, index] for specified inode.  If
+ * there's more than one return the slot closest to index.
+ *
+ * Only unlocked slots are considered.  The slot returned (if any) is marked
+ * locked before the mutex is dropped, so the caller must hand it back with
+ * release_meta_index().  Returns NULL when the cache has not been allocated
+ * yet or no slot matches.
+ */
+static struct meta_index *locate_meta_index(struct inode *inode, int offset,
+                                int index)
+{
+        struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+        struct meta_index *found = NULL;
+        struct meta_index *slot;
+        int n;
+        mutex_lock(&msblk->meta_index_mutex);
+        TRACE("locate_meta_index: index %d, offset %d\n", index, offset);
+        if (msblk->meta_index != NULL) {
+                for (n = 0; n < SQUASHFS_META_SLOTS; n++) {
+                        slot = &msblk->meta_index[n];
+                        if (slot->inode_number != inode->i_ino ||
+                                        slot->offset < offset ||
+                                        slot->offset > index ||
+                                        slot->locked != 0)
+                                continue;
+                        TRACE("locate_meta_index: entry %d, offset %d\n", n,
+                                        slot->offset);
+                        found = slot;
+                        /*
+                         * Raise the lower bound so that later candidates
+                         * only win if they start at or beyond this one.
+                         */
+                        offset = found->offset;
+                }
+                if (found)
+                        found->locked = 1;
+        }
+        mutex_unlock(&msblk->meta_index_mutex);
+        return found;
+}
+/*
+ * Find and initialise an empty cache slot for index offset.
+ *
+ * Slots are handed out round-robin starting at next_meta_index, skipping
+ * any that are currently locked.  The chosen slot is assigned to <inode>,
+ * reset to zero entries and returned locked; the caller must hand it back
+ * with release_meta_index().  Returns NULL if the cache cannot be allocated
+ * or every slot is locked.
+ */
+static struct meta_index *empty_meta_index(struct inode *inode, int offset,
+                                int skip)
+{
+        struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+        struct meta_index *slot = NULL;
+        int n, remaining;
+        mutex_lock(&msblk->meta_index_mutex);
+        TRACE("empty_meta_index: offset %d, skip %d\n", offset, skip);
+        if (msblk->meta_index == NULL) {
+                /*
+                 * First time cache index has been used, allocate and
+                 * initialise.  The cache index could be allocated at
+                 * mount time but doing it here means it is allocated only
+                 * if a 'large' file is read.
+                 */
+                msblk->meta_index = kcalloc(SQUASHFS_META_SLOTS,
+                        sizeof(*(msblk->meta_index)), GFP_KERNEL);
+                if (msblk->meta_index == NULL) {
+                        ERROR("Failed to allocate meta_index\n");
+                        goto out_unlock;
+                }
+                for (n = 0; n < SQUASHFS_META_SLOTS; n++) {
+                        msblk->meta_index[n].inode_number = 0;
+                        msblk->meta_index[n].locked = 0;
+                }
+                msblk->next_meta_index = 0;
+        }
+        /* Advance past locked slots, giving up after one full lap */
+        remaining = SQUASHFS_META_SLOTS;
+        while (remaining &&
+                        msblk->meta_index[msblk->next_meta_index].locked) {
+                msblk->next_meta_index = (msblk->next_meta_index + 1) %
+                        SQUASHFS_META_SLOTS;
+                remaining--;
+        }
+        if (remaining == 0) {
+                TRACE("empty_meta_index: failed!\n");
+                goto out_unlock;
+        }
+        TRACE("empty_meta_index: returned meta entry %d, %p\n",
+                        msblk->next_meta_index,
+                        &msblk->meta_index[msblk->next_meta_index]);
+        slot = &msblk->meta_index[msblk->next_meta_index];
+        msblk->next_meta_index = (msblk->next_meta_index + 1) %
+                        SQUASHFS_META_SLOTS;
+        slot->inode_number = inode->i_ino;
+        slot->offset = offset;
+        slot->skip = skip;
+        slot->entries = 0;
+        slot->locked = 1;
+out_unlock:
+        mutex_unlock(&msblk->meta_index_mutex);
+        return slot;
+}
+/*
+ * Hand back a cache slot obtained from locate_meta_index() or
+ * empty_meta_index(): clear its locked flag under the meta index mutex.
+ */
+static void release_meta_index(struct inode *inode, struct meta_index *meta)
+{
+        struct squashfs_sb_info *sbi = inode->i_sb->s_fs_info;
+        struct mutex *lock = &sbi->meta_index_mutex;
+        mutex_lock(lock);
+        meta->locked = 0;
+        mutex_unlock(lock);
+}
+/*
+ * Read the next n blocks from the block list, starting from
+ * metadata block <start_block, offset>.
+ *
+ * The block list is consumed in chunks of at most one page worth of 32-bit
+ * entries; <start_block, offset> are advanced by squashfs_read_metadata()
+ * as it goes.  Returns the sum of SQUASHFS_COMPRESSED_SIZE_BLOCK() over the
+ * n entries read, or a negative error code (-ENOMEM, or the metadata read
+ * error).
+ */
+static long long read_indexes(struct super_block *sb, int n,
+                                u64 *start_block, int *offset)
+{
+        long long total = 0;
+        int err, i;
+        __le32 *blist = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
+        if (!blist) {
+                ERROR("read_indexes: Failed to allocate block_list\n");
+                return -ENOMEM;
+        }
+        while (n) {
+                /* Entries are 4 bytes each, hence the shifts by 2 */
+                int blocks = min_t(int, n, PAGE_CACHE_SIZE >> 2);
+                err = squashfs_read_metadata(sb, blist, start_block,
+                                offset, blocks << 2);
+                if (err < 0) {
+                        ERROR("read_indexes: reading block [%llx:%x]\n",
+                                *start_block, *offset);
+                        total = err;
+                        break;
+                }
+                for (i = 0; i < blocks; i++) {
+                        int size = le32_to_cpu(blist[i]);
+                        total += SQUASHFS_COMPRESSED_SIZE_BLOCK(size);
+                }
+                n -= blocks;
+        }
+        kfree(blist);
+        return total;
+}
+/*
+ * Each cache index slot has SQUASHFS_META_ENTRIES, each of which
+ * can cache one index -> datablock/blocklist-block mapping.  We wish
+ * to distribute these over the length of the file, entry[0] maps index x,
+ * entry[1] maps index x + skip, entry[2] maps index x + 2 * skip, and so on.
+ * The larger the file, the greater the skip factor.  The skip factor is
+ * limited to the size of the metadata cache (SQUASHFS_CACHED_BLKS) to ensure
+ * the number of metadata blocks that need to be read fits into the cache.
+ * If the skip factor is limited in this way then the file will use multiple
+ * slots.
+ *
+ * <blocks> is the file length in datablocks; the value returned is the skip
+ * factor, at most SQUASHFS_CACHED_BLKS - 1.
+ */
+static inline int calculate_skip(int blocks)
+{
+        int skip = blocks / ((SQUASHFS_META_ENTRIES + 1)
+                 * SQUASHFS_META_INDEXES);
+        int cap = SQUASHFS_CACHED_BLKS - 1;
+        return min(cap, skip + 1);
+}
+/*
+ * Search and grow the index cache for the specified inode, returning the
+ * on-disk locations of the datablock and block list metadata block
+ * <index_block, index_offset> for index (scaled to nearest cache index).
+ *
+ * On success the return value is the block index that the returned
+ * locations actually correspond to; it is a multiple of
+ * SQUASHFS_META_INDEXES * skip and may be smaller than <index>, in which
+ * case the caller has to walk the block list for the remainder.  If
+ * reading the block list fails the negative error from read_indexes() is
+ * returned and the output parameters are left unset.
+ */
+static int fill_meta_index(struct inode *inode, int index,
+                u64 *index_block, int *index_offset, u64 *data_block)
+{
+        struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+        int skip = calculate_skip(i_size_read(inode) >> msblk->block_log);
+        int offset = 0;
+        struct meta_index *meta;
+        struct meta_entry *meta_entry;
+        /* Start from the beginning of the file's block list and data */
+        u64 cur_index_block = squashfs_i(inode)->block_list_start;
+        int cur_offset = squashfs_i(inode)->offset;
+        u64 cur_data_block = squashfs_i(inode)->start;
+        int err, i;
+        /*
+         * Scale index to cache index (cache slot entry)
+         */
+        index /= SQUASHFS_META_INDEXES * skip;
+        while (offset < index) {
+                meta = locate_meta_index(inode, offset + 1, index);
+                if (meta == NULL) {
+                        meta = empty_meta_index(inode, offset + 1, skip);
+                        /*
+                         * No slot available: stop here and return the
+                         * position resolved so far.
+                         */
+                        if (meta == NULL)
+                                goto all_done;
+                } else {
+                        /*
+                         * Existing slot: jump to the cached entry for
+                         * index, or to the slot's last entry if the slot
+                         * does not reach that far yet.
+                         */
+                        offset = index < meta->offset + meta->entries ? index :
+                                meta->offset + meta->entries - 1;
+                        meta_entry = &meta->meta_entry[offset - meta->offset];
+                        /* index_block is cached relative to inode_table */
+                        cur_index_block = meta_entry->index_block +
+                                msblk->inode_table;
+                        cur_offset = meta_entry->offset;
+                        cur_data_block = meta_entry->data_block;
+                        TRACE("get_meta_index: offset %d, meta->offset %d, "
+                                "meta->entries %d\n", offset, meta->offset,
+                                meta->entries);
+                        TRACE("get_meta_index: index_block 0x%llx, offset 0x%x"
+                                " data_block 0x%llx\n", cur_index_block,
+                                cur_offset, cur_data_block);
+                }
+                /*
+                 * If necessary grow cache slot by reading block list.  Cache
+                 * slot is extended up to index or to the end of the slot, in
+                 * which case further slots will be used.
+                 */
+                for (i = meta->offset + meta->entries; i <= index &&
+                                i < meta->offset + SQUASHFS_META_ENTRIES; i++) {
+                        int blocks = skip * SQUASHFS_META_INDEXES;
+                        /*
+                         * Advances cur_index_block/cur_offset past <blocks>
+                         * block list entries and returns the value to add
+                         * to the datablock position.
+                         */
+                        long long res = read_indexes(inode->i_sb, blocks,
+                                        &cur_index_block, &cur_offset);
+                        if (res < 0) {
+                                if (meta->entries == 0)
+                                        /*
+                                         * Don't leave an empty slot on read
+                                         * error allocated to this inode...
+                                         */
+                                        meta->inode_number = 0;
+                                err = res;
+                                goto failed;
+                        }
+                        cur_data_block += res;
+                        /* Record the new position as the slot's next entry */
+                        meta_entry = &meta->meta_entry[i - meta->offset];
+                        meta_entry->index_block = cur_index_block -
+                                msblk->inode_table;
+                        meta_entry->offset = cur_offset;
+                        meta_entry->data_block = cur_data_block;
+                        meta->entries++;
+                        offset++;
+                }
+                TRACE("get_meta_index: meta->offset %d, meta->entries %d\n",
+                                meta->offset, meta->entries);
+                /* Unlock the slot before (possibly) moving on to the next */
+                release_meta_index(inode, meta);
+        }
+all_done:
+        *index_block = cur_index_block;
+        *index_offset = cur_offset;
+        *data_block = cur_data_block;
+        /*
+         * Scale cache index (cache slot entry) to index
+         */
+        return offset * SQUASHFS_META_INDEXES * skip;
+failed:
+        /* meta is always a valid, locked slot when we get here */
+        release_meta_index(inode, meta);
+        return err;
+}
+/*
+ * Get the on-disk location and compressed size of the datablock
+ * specified by index.  Fill_meta_index() does most of the work.
+ *
+ * On success *block holds the datablock's location and the return value is
+ * the 32-bit size word read from the block list for that datablock.  A
+ * negative return value is an error from fill_meta_index(), read_indexes()
+ * or squashfs_read_metadata().
+ */
+static int read_blocklist(struct inode *inode, int index, u64 *block)
+{
+        u64 start;
+        int offset;
+        __le32 size;
+        int err;
+        int res = fill_meta_index(inode, index, &start, &offset, block);
+        TRACE("read_blocklist: res %d, index %d, start 0x%llx, offset"
+                       " 0x%x, block 0x%llx\n", res, index, start, offset,
+                        *block);
+        if (res < 0)
+                return res;
+        /*
+         * res contains the index of the mapping returned by fill_meta_index(),
+         * this will likely be less than the desired index (because the
+         * meta_index cache works at a higher granularity).  Read any
+         * extra block indexes needed.
+         */
+        if (res < index) {
+                long long skipped = read_indexes(inode->i_sb, index - res,
+                                &start, &offset);
+                if (skipped < 0)
+                        return (int) skipped;
+                *block += skipped;
+        }
+        /*
+         * Read length of block specified by index.
+         */
+        err = squashfs_read_metadata(inode->i_sb, &size, &start, &offset,
+                        sizeof(size));
+        if (err < 0)
+                return err;
+        return le32_to_cpu(size);
+}
+/*
+ * readpage implementation for regular files.
+ *
+ * Locates the datablock (or tail-end fragment) that backs <page>, gets it
+ * through the cache helpers, and copies it into <page> and, where they can
+ * be grabbed without waiting, into the other page cache pages covered by
+ * the same datablock.
+ *
+ * Always returns 0.  A failure is reported by setting the error flag on
+ * <page>; in that case the page is zero filled and not marked up to date.
+ * A page lying wholly beyond the end of the file is zero filled and marked
+ * up to date.  <page> is unlocked on every path.
+ */
+static int squashfs_readpage(struct file *file, struct page *page)
+{
+        struct inode *inode = page->mapping->host;
+        struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+        int bytes, i, offset = 0, sparse = 0;
+        struct squashfs_cache_entry *buffer = NULL;
+        void *pageaddr;
+        /*
+         * mask covers the page-index bits that select a page within one
+         * datablock (assumes block_log is log2 of the block size - TODO
+         * confirm).  index is the datablock number containing the page,
+         * start_index/end_index the first and last page of that datablock.
+         */
+        int mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1;
+        int index = page->index >> (msblk->block_log - PAGE_CACHE_SHIFT);
+        int start_index = page->index & ~mask;
+        int end_index = start_index | mask;
+        /* Number of complete datablocks in the file */
+        int file_end = i_size_read(inode) >> msblk->block_log;
+        TRACE("Entered squashfs_readpage, page index %lx, start block %llx\n",
+                                page->index, squashfs_i(inode)->start);
+        /* Page is entirely past end of file: nothing to read */
+        if (page->index >= ((i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
+                                        PAGE_CACHE_SHIFT))
+                goto out;
+        if (index < file_end || squashfs_i(inode)->fragment_block ==
+                                        SQUASHFS_INVALID_BLK) {
+                /*
+                 * Reading a datablock from disk.  Need to read block list
+                 * to get location and block size.
+                 */
+                u64 block = 0;
+                int bsize = read_blocklist(inode, index, &block);
+                if (bsize < 0)
+                        goto error_out;
+                if (bsize == 0) { /* hole */
+                        /*
+                         * Nothing stored on disk: the pages are zero
+                         * filled below.  bytes is the amount of file data
+                         * the block covers (a partial block at the tail).
+                         */
+                        bytes = index == file_end ?
+                                (i_size_read(inode) & (msblk->block_size - 1)) :
+                                 msblk->block_size;
+                        sparse = 1;
+                } else {
+                        /*
+                         * Read and decompress datablock.
+                         */
+                        buffer = squashfs_get_datablock(inode->i_sb,
+                                                                block, bsize);
+                        if (buffer->error) {
+                                ERROR("Unable to read page, block %llx, size %x"
+                                        "\n", block, bsize);
+                                squashfs_cache_put(buffer);
+                                goto error_out;
+                        }
+                        bytes = buffer->length;
+                }
+        } else {
+                /*
+                 * Datablock is stored inside a fragment (tail-end packed
+                 * block).
+                 */
+                buffer = squashfs_get_fragment(inode->i_sb,
+                                squashfs_i(inode)->fragment_block,
+                                squashfs_i(inode)->fragment_size);
+                if (buffer->error) {
+                        ERROR("Unable to read page, block %llx, size %x\n",
+                                squashfs_i(inode)->fragment_block,
+                                squashfs_i(inode)->fragment_size);
+                        squashfs_cache_put(buffer);
+                        goto error_out;
+                }
+                /* The file's tail starts fragment_offset bytes into it */
+                bytes = i_size_read(inode) & (msblk->block_size - 1);
+                offset = squashfs_i(inode)->fragment_offset;
+        }
+        /*
+         * Loop copying datablock into pages.  As the datablock likely covers
+         * many PAGE_CACHE_SIZE pages (default block size is 128 KiB) explicitly
+         * grab the pages from the page cache, except for the page that we've
+         * been called to fill.
+         */
+        for (i = start_index; i <= end_index && bytes > 0; i++,
+                        bytes -= PAGE_CACHE_SIZE, offset += PAGE_CACHE_SIZE) {
+                struct page *push_page;
+                int avail = sparse ? 0 : min_t(int, bytes, PAGE_CACHE_SIZE);
+                TRACE("bytes %d, i %d, available_bytes %d\n", bytes, i, avail);
+                push_page = (i == page->index) ? page :
+                        grab_cache_page_nowait(page->mapping, i);
+                /*
+                 * Page could not be grabbed: skip it.  The loop increment
+                 * still advances bytes and offset past this page's data.
+                 */
+                if (!push_page)
+                        continue;
+                if (PageUptodate(push_page))
+                        goto skip_page;
+                pageaddr = kmap_atomic(push_page, KM_USER0);
+                /*
+                 * NOTE(review): for a hole buffer is NULL and avail is 0,
+                 * so this relies on squashfs_copy_data() not touching the
+                 * buffer when asked for zero bytes - confirm.
+                 */
+                squashfs_copy_data(pageaddr, buffer, offset, avail);
+                /* Zero whatever part of the page has no data behind it */
+                memset(pageaddr + avail, 0, PAGE_CACHE_SIZE - avail);
+                kunmap_atomic(pageaddr, KM_USER0);
+                flush_dcache_page(push_page);
+                SetPageUptodate(push_page);
+skip_page:
+                unlock_page(push_page);
+                /* Drop the reference taken by grab_cache_page_nowait() */
+                if (i != page->index)
+                        page_cache_release(push_page);
+        }
+        /* A hole never obtained a cache entry, so there is nothing to put */
+        if (!sparse)
+                squashfs_cache_put(buffer);
+        return 0;
+error_out:
+        SetPageError(page);
+out:
+        /* Zero fill; only mark up to date if no error was flagged */
+        pageaddr = kmap_atomic(page, KM_USER0);
+        memset(pageaddr, 0, PAGE_CACHE_SIZE);
+        kunmap_atomic(pageaddr, KM_USER0);
+        flush_dcache_page(page);
+        if (!PageError(page))
+                SetPageUptodate(page);
+        unlock_page(page);
+        return 0;
+}
+/*
+ * Address space operations for regular files: only readpage is provided.
+ */
+const struct address_space_operations squashfs_aops = {
+        .readpage = squashfs_readpage,
+};
diff --git a/fs/squashfs/fragment.c b/fs/squashfs/fragment.c
new file mode 100644
index 000000000000..b5a2c15bbbc7
--- /dev/null
+++ b/fs/squashfs/fragment.c
@@ -0,0 +1,98 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * fragment.c
+ */
+/*
+ * This file implements code to handle compressed fragments (tail-end packed
+ * datablocks).
+ *
+ * Regular files contain a fragment index which is mapped to a fragment
+ * location on disk and compressed size using a fragment lookup table.
+ * Like everything in Squashfs this fragment lookup table is itself stored
+ * compressed into metadata blocks.  A second index table is used to locate
+ * these.  This second index table for speed of access (and because it
+ * is small) is read at mount time and cached in memory.
+ */
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/slab.h>
+#include <linux/zlib.h>
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+/*
+ * Resolve a fragment number through the in-memory fragment index table.
+ *
+ * The index table entry selects the metadata block holding the fragment's
+ * lookup entry; that entry is read and byte-swapped.  On success the
+ * fragment's on-disk start block is stored in *fragment_block and the
+ * entry's size field is returned.  If the metadata read fails, its negative
+ * error code is returned and *fragment_block is left untouched.
+ */
+int squashfs_frag_lookup(struct super_block *sb, unsigned int fragment,
+                                u64 *fragment_block)
+{
+        struct squashfs_sb_info *msblk = sb->s_fs_info;
+        struct squashfs_fragment_entry entry;
+        int idx = SQUASHFS_FRAGMENT_INDEX(fragment);
+        int pos = SQUASHFS_FRAGMENT_INDEX_OFFSET(fragment);
+        u64 blk = le64_to_cpu(msblk->fragment_index[idx]);
+        int res;
+        res = squashfs_read_metadata(sb, &entry, &blk, &pos, sizeof(entry));
+        if (res < 0)
+                return res;
+        /* The entry is in on-disk little-endian form; swap while copying out */
+        *fragment_block = le64_to_cpu(entry.start_block);
+        return le32_to_cpu(entry.size);
+}
+/*
+ * Allocate a buffer and fill it with the (uncompressed) fragment lookup
+ * table indexes read from fragment_table_start.
+ *
+ * Returns the newly allocated table, which the caller owns, or an ERR_PTR():
+ * -ENOMEM if the allocation fails, otherwise the error reported by
+ * squashfs_read_table().
+ */
+__le64 *squashfs_read_fragment_index_table(struct super_block *sb,
+        u64 fragment_table_start, unsigned int fragments)
+{
+        unsigned int nbytes = SQUASHFS_FRAGMENT_INDEX_BYTES(fragments);
+        __le64 *table = kmalloc(nbytes, GFP_KERNEL);
+        int res;
+        if (!table) {
+                ERROR("Failed to allocate fragment index table\n");
+                return ERR_PTR(-ENOMEM);
+        }
+        res = squashfs_read_table(sb, table, fragment_table_start, nbytes);
+        if (res >= 0)
+                return table;
+        /* Read failed: the partially filled buffer is of no use */
+        ERROR("unable to read fragment index table\n");
+        kfree(table);
+        return ERR_PTR(res);
+}
diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c
new file mode 100644
index 000000000000..3795b837ba28
--- /dev/null
+++ b/fs/squashfs/id.c
@@ -0,0 +1,94 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * id.c
+ */
+/*
+ * This file implements code to handle uids and gids.
+ *
+ * For space efficiency regular files store uid and gid indexes, which are
+ * converted to 32-bit uids/gids using an id look up table.  This table is
+ * stored compressed into metadata blocks.  A second index table is used to
+ * locate these.  This second index table for speed of access (and because it
+ * is small) is read at mount time and cached in memory.
+ */
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/slab.h>
+#include <linux/zlib.h>
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+/*
+ * Translate a uid/gid table index into the 32-bit id it stands for.
+ *
+ * The in-memory id index table gives the metadata block holding the
+ * entry; the little-endian id read from it is converted to CPU order and
+ * stored in *id.  Returns 0 on success, or the negative error from the
+ * metadata read (in which case *id is not written).
+ */
+int squashfs_get_id(struct super_block *sb, unsigned int index,
+                                        unsigned int *id)
+{
+        struct squashfs_sb_info *msblk = sb->s_fs_info;
+        __le32 raw_id;
+        int slot = SQUASHFS_ID_BLOCK(index);
+        int pos = SQUASHFS_ID_BLOCK_OFFSET(index);
+        u64 blk = le64_to_cpu(msblk->id_table[slot]);
+        int res;
+        res = squashfs_read_metadata(sb, &raw_id, &blk, &pos, sizeof(raw_id));
+        if (res < 0)
+                return res;
+        *id = le32_to_cpu(raw_id);
+        return 0;
+}
+/*
+ * Allocate a buffer and fill it with the (uncompressed) id lookup table
+ * indexes read from id_table_start.
+ *
+ * Returns the newly allocated table, which the caller owns, or an ERR_PTR():
+ * -ENOMEM if the allocation fails, otherwise the error reported by
+ * squashfs_read_table().
+ */
+__le64 *squashfs_read_id_index_table(struct super_block *sb,
+                        u64 id_table_start, unsigned short no_ids)
+{
+        unsigned int nbytes = SQUASHFS_ID_BLOCK_BYTES(no_ids);
+        __le64 *table;
+        int res;
+        TRACE("In read_id_index_table, length %d\n", nbytes);
+        table = kmalloc(nbytes, GFP_KERNEL);
+        if (!table) {
+                ERROR("Failed to allocate id index table\n");
+                return ERR_PTR(-ENOMEM);
+        }
+        res = squashfs_read_table(sb, table, id_table_start, nbytes);
+        if (res < 0) {
+                /* Read failed: release the buffer and hand back the error */
+                ERROR("unable to read id index table\n");
+                kfree(table);
+                table = ERR_PTR(res);
+        }
+        return table;
+}
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
new file mode 100644
index 000000000000..7a63398bb855
--- /dev/null
+++ b/fs/squashfs/inode.c
@@ -0,0 +1,346 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * inode.c
+ */
+/*
+ * This file implements code to create and read inodes from disk.
+ *
+ * Inodes in Squashfs are identified by a 48-bit inode which encodes the
+ * location of the compressed metadata block containing the inode, and the byte
+ * offset into that block where the inode is placed (<block, offset>).
+ *
+ * To maximise compression there are different inodes for each file type
+ * (regular file, directory, device, etc.), the inode contents and length
+ * varying with the type.
+ *
+ * To further maximise compression, two types of regular file inode and
+ * directory inode are defined: inodes optimised for frequently occurring
+ * regular files and directories, and extended types where extra
+ * information has to be stored.
+ */
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/zlib.h>
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+/*
+ * Fill in the VFS inode fields that every Squashfs inode type shares.
+ *
+ * sqsh_ino is the base inode exactly as read off disk (still little-endian).
+ * Its uid and guid fields are indexes that are resolved to real ids with
+ * squashfs_get_id(); if either look-up fails that error is returned and the
+ * remaining fields are not written.  All three timestamps are set from the
+ * single on-disk mtime and i_size starts out as 0.  Returns 0 on success.
+ */
+static int squashfs_new_inode(struct super_block *sb, struct inode *inode,
+                                struct squashfs_base_inode *sqsh_ino)
+{
+        int res;
+        res = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->uid), &inode->i_uid);
+        if (!res)
+                res = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->guid),
+                                        &inode->i_gid);
+        if (res)
+                return res;
+        inode->i_ino = le32_to_cpu(sqsh_ino->inode_number);
+        inode->i_mode = le16_to_cpu(sqsh_ino->mode);
+        inode->i_size = 0;
+        /* Only one timestamp is stored on disk; reuse it for all three */
+        inode->i_mtime.tv_sec = le32_to_cpu(sqsh_ino->mtime);
+        inode->i_atime.tv_sec = inode->i_mtime.tv_sec;
+        inode->i_ctime.tv_sec = inode->i_mtime.tv_sec;
+        return 0;
+}
+/*
+ * Obtain the VFS inode numbered ino_number, reading it from disk if needed.
+ *
+ * ino is the Squashfs inode reference handed to squashfs_read_inode() when
+ * iget_locked() returns a new (I_NEW) inode; an inode that is not new is
+ * returned as is.  Returns the inode, ERR_PTR(-ENOMEM) if no inode could be
+ * obtained, or an ERR_PTR() carrying the squashfs_read_inode() error.
+ */
+struct inode *squashfs_iget(struct super_block *sb, long long ino,
+                                unsigned int ino_number)
+{
+        struct inode *vfs_inode = iget_locked(sb, ino_number);
+        int res;
+        TRACE("Entered squashfs_iget\n");
+        if (vfs_inode == NULL)
+                return ERR_PTR(-ENOMEM);
+        if (vfs_inode->i_state & I_NEW) {
+                res = squashfs_read_inode(vfs_inode, ino);
+                if (res) {
+                        iget_failed(vfs_inode);
+                        return ERR_PTR(res);
+                }
+                unlock_new_inode(vfs_inode);
+        }
+        return vfs_inode;
+}
+/*
+ * Initialise VFS inode by reading inode from inode table (compressed
+ * metadata).  The format and amount of data read depends on type.
+ *
+ * ino is the Squashfs inode reference; SQUASHFS_INODE_BLK() and
+ * SQUASHFS_INODE_OFFSET() split it into the metadata block (relative to the
+ * inode table start) and the offset within that block.
+ *
+ * The base inode common to all types is read first to learn the type and to
+ * fill in the shared fields, then the read position is rewound and the full
+ * type-specific inode is read from the same location.
+ *
+ * Returns 0 on success, -EINVAL for an unknown inode type, or the negative
+ * error code of whichever metadata/id/fragment look-up failed.
+ */
+int squashfs_read_inode(struct inode *inode, long long ino)
+{
+        struct super_block *sb = inode->i_sb;
+        struct squashfs_sb_info *msblk = sb->s_fs_info;
+        u64 block = SQUASHFS_INODE_BLK(ino) + msblk->inode_table;
+        int err, type, offset = SQUASHFS_INODE_OFFSET(ino);
+        union squashfs_inode squashfs_ino;
+        struct squashfs_base_inode *sqshb_ino = &squashfs_ino.base;
+        TRACE("Entered squashfs_read_inode\n");
+        /*
+         * Read inode base common to all inode types.
+         */
+        err = squashfs_read_metadata(sb, sqshb_ino, &block,
+                                &offset, sizeof(*sqshb_ino));
+        if (err < 0)
+                goto failed_read;
+        err = squashfs_new_inode(sb, inode, sqshb_ino);
+        if (err)
+                goto failed_read;
+        /* Rewind: the type-specific structures below include the base part */
+        block = SQUASHFS_INODE_BLK(ino) + msblk->inode_table;
+        offset = SQUASHFS_INODE_OFFSET(ino);
+        type = le16_to_cpu(sqshb_ino->inode_type);
+        switch (type) {
+        case SQUASHFS_REG_TYPE: {
+                unsigned int frag_offset, frag;
+                /* Signed: squashfs_frag_lookup() returns a negative errno */
+                int frag_size;
+                u64 frag_blk;
+                struct squashfs_reg_inode *sqsh_ino = &squashfs_ino.reg;
+                err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
+                                                        sizeof(*sqsh_ino));
+                if (err < 0)
+                        goto failed_read;
+                frag = le32_to_cpu(sqsh_ino->fragment);
+                if (frag != SQUASHFS_INVALID_FRAG) {
+                        frag_offset = le32_to_cpu(sqsh_ino->offset);
+                        frag_size = squashfs_frag_lookup(sb, frag, &frag_blk);
+                        if (frag_size < 0) {
+                                err = frag_size;
+                                goto failed_read;
+                        }
+                } else {
+                        /* File has no tail-end fragment */
+                        frag_blk = SQUASHFS_INVALID_BLK;
+                        frag_size = 0;
+                        frag_offset = 0;
+                }
+                inode->i_nlink = 1;
+                inode->i_size = le32_to_cpu(sqsh_ino->file_size);
+                inode->i_fop = &generic_ro_fops;
+                inode->i_mode |= S_IFREG;
+                /* Size rounded up to whole 512 byte units */
+                inode->i_blocks = ((inode->i_size - 1) >> 9) + 1;
+                squashfs_i(inode)->fragment_block = frag_blk;
+                squashfs_i(inode)->fragment_size = frag_size;
+                squashfs_i(inode)->fragment_offset = frag_offset;
+                squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block);
+                /* The block list follows the inode in the metadata stream */
+                squashfs_i(inode)->block_list_start = block;
+                squashfs_i(inode)->offset = offset;
+                inode->i_data.a_ops = &squashfs_aops;
+                TRACE("File inode %x:%x, start_block %llx, block_list_start "
+                        "%llx, offset %x\n", SQUASHFS_INODE_BLK(ino),
+                        offset, squashfs_i(inode)->start, block, offset);
+                break;
+        }
+        case SQUASHFS_LREG_TYPE: {
+                unsigned int frag_offset, frag;
+                /* Signed: squashfs_frag_lookup() returns a negative errno */
+                int frag_size;
+                u64 frag_blk;
+                struct squashfs_lreg_inode *sqsh_ino = &squashfs_ino.lreg;
+                err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
+                                                        sizeof(*sqsh_ino));
+                if (err < 0)
+                        goto failed_read;
+                frag = le32_to_cpu(sqsh_ino->fragment);
+                if (frag != SQUASHFS_INVALID_FRAG) {
+                        frag_offset = le32_to_cpu(sqsh_ino->offset);
+                        frag_size = squashfs_frag_lookup(sb, frag, &frag_blk);
+                        if (frag_size < 0) {
+                                err = frag_size;
+                                goto failed_read;
+                        }
+                } else {
+                        /* File has no tail-end fragment */
+                        frag_blk = SQUASHFS_INVALID_BLK;
+                        frag_size = 0;
+                        frag_offset = 0;
+                }
+                inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
+                inode->i_size = le64_to_cpu(sqsh_ino->file_size);
+                inode->i_fop = &generic_ro_fops;
+                inode->i_mode |= S_IFREG;
+                /* As above, but the sparse byte count is not charged */
+                inode->i_blocks = ((inode->i_size -
+                                le64_to_cpu(sqsh_ino->sparse) - 1) >> 9) + 1;
+                squashfs_i(inode)->fragment_block = frag_blk;
+                squashfs_i(inode)->fragment_size = frag_size;
+                squashfs_i(inode)->fragment_offset = frag_offset;
+                squashfs_i(inode)->start = le64_to_cpu(sqsh_ino->start_block);
+                squashfs_i(inode)->block_list_start = block;
+                squashfs_i(inode)->offset = offset;
+                inode->i_data.a_ops = &squashfs_aops;
+                TRACE("File inode %x:%x, start_block %llx, block_list_start "
+                        "%llx, offset %x\n", SQUASHFS_INODE_BLK(ino),
+                        offset, squashfs_i(inode)->start, block, offset);
+                break;
+        }
+        case SQUASHFS_DIR_TYPE: {
+                struct squashfs_dir_inode *sqsh_ino = &squashfs_ino.dir;
+                err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
+                                sizeof(*sqsh_ino));
+                if (err < 0)
+                        goto failed_read;
+                inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
+                inode->i_size = le16_to_cpu(sqsh_ino->file_size);
+                inode->i_op = &squashfs_dir_inode_ops;
+                inode->i_fop = &squashfs_dir_ops;
+                inode->i_mode |= S_IFDIR;
+                squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block);
+                squashfs_i(inode)->offset = le16_to_cpu(sqsh_ino->offset);
+                /* Basic directories carry no directory index */
+                squashfs_i(inode)->dir_idx_cnt = 0;
+                squashfs_i(inode)->parent = le32_to_cpu(sqsh_ino->parent_inode);
+                TRACE("Directory inode %x:%x, start_block %llx, offset %x\n",
+                                SQUASHFS_INODE_BLK(ino), offset,
+                                squashfs_i(inode)->start,
+                                le16_to_cpu(sqsh_ino->offset));
+                break;
+        }
+        case SQUASHFS_LDIR_TYPE: {
+                struct squashfs_ldir_inode *sqsh_ino = &squashfs_ino.ldir;
+                err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
+                                sizeof(*sqsh_ino));
+                if (err < 0)
+                        goto failed_read;
+                inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
+                inode->i_size = le32_to_cpu(sqsh_ino->file_size);
+                inode->i_op = &squashfs_dir_inode_ops;
+                inode->i_fop = &squashfs_dir_ops;
+                inode->i_mode |= S_IFDIR;
+                squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block);
+                squashfs_i(inode)->offset = le16_to_cpu(sqsh_ino->offset);
+                /* The directory index follows the inode in the metadata */
+                squashfs_i(inode)->dir_idx_start = block;
+                squashfs_i(inode)->dir_idx_offset = offset;
+                squashfs_i(inode)->dir_idx_cnt = le16_to_cpu(sqsh_ino->i_count);
+                squashfs_i(inode)->parent = le32_to_cpu(sqsh_ino->parent_inode);
+                TRACE("Long directory inode %x:%x, start_block %llx, offset "
+                                "%x\n", SQUASHFS_INODE_BLK(ino), offset,
+                                squashfs_i(inode)->start,
+                                le16_to_cpu(sqsh_ino->offset));
+                break;
+        }
+        case SQUASHFS_SYMLINK_TYPE:
+        case SQUASHFS_LSYMLINK_TYPE: {
+                struct squashfs_symlink_inode *sqsh_ino = &squashfs_ino.symlink;
+                err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
+                                sizeof(*sqsh_ino));
+                if (err < 0)
+                        goto failed_read;
+                inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
+                inode->i_size = le32_to_cpu(sqsh_ino->symlink_size);
+                inode->i_op = &page_symlink_inode_operations;
+                inode->i_data.a_ops = &squashfs_symlink_aops;
+                inode->i_mode |= S_IFLNK;
+                /* Remember the position just past the inode structure */
+                squashfs_i(inode)->start = block;
+                squashfs_i(inode)->offset = offset;
+                TRACE("Symbolic link inode %x:%x, start_block %llx, offset "
+                                "%x\n", SQUASHFS_INODE_BLK(ino), offset,
+                                block, offset);
+                break;
+        }
+        case SQUASHFS_BLKDEV_TYPE:
+        case SQUASHFS_CHRDEV_TYPE:
+        case SQUASHFS_LBLKDEV_TYPE:
+        case SQUASHFS_LCHRDEV_TYPE: {
+                /*
+                 * NOTE(review): the extended (L) variants are read using the
+                 * basic structure, as before - confirm the layouts share
+                 * this prefix.
+                 */
+                struct squashfs_dev_inode *sqsh_ino = &squashfs_ino.dev;
+                unsigned int rdev;
+                err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
+                                sizeof(*sqsh_ino));
+                if (err < 0)
+                        goto failed_read;
+                /* Both character device types must end up as S_IFCHR */
+                if (type == SQUASHFS_CHRDEV_TYPE ||
+                                type == SQUASHFS_LCHRDEV_TYPE)
+                        inode->i_mode |= S_IFCHR;
+                else
+                        inode->i_mode |= S_IFBLK;
+                inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
+                rdev = le32_to_cpu(sqsh_ino->rdev);
+                init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
+                TRACE("Device inode %x:%x, rdev %x\n",
+                                SQUASHFS_INODE_BLK(ino), offset, rdev);
+                break;
+        }
+        case SQUASHFS_FIFO_TYPE:
+        case SQUASHFS_SOCKET_TYPE:
+        case SQUASHFS_LFIFO_TYPE:
+        case SQUASHFS_LSOCKET_TYPE: {
+                /*
+                 * NOTE(review): the extended (L) variants are read using the
+                 * basic structure, as before - confirm the layouts share
+                 * this prefix.
+                 */
+                struct squashfs_ipc_inode *sqsh_ino = &squashfs_ino.ipc;
+                err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
+                                sizeof(*sqsh_ino));
+                if (err < 0)
+                        goto failed_read;
+                /* Both fifo types must end up as S_IFIFO */
+                if (type == SQUASHFS_FIFO_TYPE || type == SQUASHFS_LFIFO_TYPE)
+                        inode->i_mode |= S_IFIFO;
+                else
+                        inode->i_mode |= S_IFSOCK;
+                inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
+                init_special_inode(inode, inode->i_mode, 0);
+                break;
+        }
+        default:
+                ERROR("Unknown inode type %d in squashfs_iget!\n", type);
+                return -EINVAL;
+        }
+        return 0;
+failed_read:
+        ERROR("Unable to read inode 0x%llx\n", ino);
+        return err;
+}
diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c
new file mode 100644
index 000000000000..9e398653b22b
--- /dev/null
+++ b/fs/squashfs/namei.c
@@ -0,0 +1,242 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * namei.c
+ */
+/*
+ * This file implements code to do filename lookup in directories.
+ *
+ * Like inodes, directories are packed into compressed metadata blocks, stored
+ * in a directory table.  Directories are accessed using the start address of
+ * the metablock containing the directory and the offset into the
+ * decompressed block (<block, offset>).
+ *
+ * Directories are organised in a slightly complex way, and are not simply
+ * a list of file names.  The organisation takes advantage of the
+ * fact that (in most cases) the inodes of the files will be in the same
+ * compressed metadata block, and therefore, can share the start block.
+ * Directories are therefore organised in a two level list, a directory
+ * header containing the shared start block value, and a sequence of directory
+ * entries, each of which share the shared start block.  A new directory header
+ * is written once/if the inode start block changes.  The directory
+ * header/directory entry list is repeated as many times as necessary.
+ *
+ * Directories are sorted, and can contain a directory index to speed up
+ * file lookup.  Directory indexes store one entry per metablock, each entry
+ * storing the index/filename mapping to the first directory header
+ * in each metadata block.  Directories are sorted in alphabetical order,
+ * and at lookup the index is scanned linearly looking for the first filename
+ * alphabetically larger than the filename being looked up.  At this point the
+ * location of the metadata block the filename is in has been found.
+ * The general idea of the index is to ensure only one metadata block needs
+ * to be decompressed to do a lookup irrespective of the length of the
+ * directory.  This scheme has the advantage that it doesn't require extra
+ * memory overhead and doesn't require much extra storage on disk.
+ */
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/dcache.h>
+#include <linux/zlib.h>
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+/*
+ * Lookup name in the directory index, returning the location of the metadata
+ * block containing it, and the directory index this represents.
+ *
+ * The index (i_count entries starting at <index_start, index_offset>) is
+ * scanned linearly until an entry whose name compares greater than the name
+ * being looked up is found.  For every entry passed over, *next_block is set
+ * to that entry's metadata block; *next_offset is finally advanced by the
+ * index value of the last such entry.  name need not be NUL terminated, but
+ * len must not exceed SQUASHFS_NAME_LEN (the private copy made here has room
+ * for no more).
+ *
+ * If we get an error reading the index then return the part of the index
+ * (if any) we have managed to read - the index isn't essential, just
+ * quicker.  An index entry with an out of range name length is handled the
+ * same way, as is failure to allocate the scratch buffer (in which case
+ * *next_block and *next_offset are left untouched).
+ */
+static int get_dir_index_using_name(struct super_block *sb,
+                        u64 *next_block, int *next_offset, u64 index_start,
+                        int index_offset, int i_count, const char *name,
+                        int len)
+{
+        struct squashfs_sb_info *msblk = sb->s_fs_info;
+        int i, size, length = 0, err;
+        struct squashfs_dir_index *index;
+        char *str;
+        TRACE("Entered get_dir_index_using_name, i_count %d\n", i_count);
+        /*
+         * A single allocation holds the index entry header, SQUASHFS_NAME_LEN
+         * + 1 bytes for the entry's name and, after that, SQUASHFS_NAME_LEN
+         * + 1 bytes for a NUL terminated copy of the name being looked up.
+         */
+        index = kmalloc(sizeof(*index) + SQUASHFS_NAME_LEN * 2 + 2, GFP_KERNEL);
+        if (index == NULL) {
+                ERROR("Failed to allocate squashfs_dir_index\n");
+                goto out;
+        }
+        str = &index->name[SQUASHFS_NAME_LEN + 1];
+        strncpy(str, name, len);
+        str[len] = '\0';
+        for (i = 0; i < i_count; i++) {
+                err = squashfs_read_metadata(sb, index, &index_start,
+                                        &index_offset, sizeof(*index));
+                if (err < 0)
+                        break;
+                /* The name length is stored on disk as length - 1 */
+                size = le32_to_cpu(index->size) + 1;
+                /*
+                 * Never trust the on-disk length: anything outside this range
+                 * would overrun the name area (and the copy of the looked up
+                 * name behind it).  Stop and use what has been read so far.
+                 */
+                if (size <= 0 || size > SQUASHFS_NAME_LEN)
+                        break;
+                err = squashfs_read_metadata(sb, index->name, &index_start,
+                                        &index_offset, size);
+                if (err < 0)
+                        break;
+                index->name[size] = '\0';
+                if (strcmp(index->name, str) > 0)
+                        break;
+                length = le32_to_cpu(index->index);
+                *next_block = le32_to_cpu(index->start_block) +
+                                        msblk->directory_table;
+        }
+        *next_offset = (length + *next_offset) % SQUASHFS_METADATA_SIZE;
+        kfree(index);
+out:
+        /*
+         * Return index (f_pos) of the looked up metadata block.  Translate
+         * from internal f_pos to external f_pos which is offset by 3 because
+         * we invent "." and ".." entries which are not actually stored in the
+         * directory.
+         */
+        return length + 3;
+}
+/*
+ * Look up the name held by dentry in directory dir.
+ *
+ * The directory index (if any) is consulted first to skip ahead, then the
+ * directory headers and their entries are read one by one until the name is
+ * found, an entry whose first character sorts after the name's first
+ * character is seen, or the end of the directory is reached.
+ *
+ * If the name is found its inode is obtained with squashfs_iget() and the
+ * result of d_splice_alias() is returned; if it is not found a negative
+ * dentry is added and NULL is returned.  On failure an ERR_PTR() is
+ * returned: -ENAMETOOLONG if the name is longer than SQUASHFS_NAME_LEN,
+ * -ENOMEM if the scratch entry cannot be allocated, -EIO if a directory
+ * entry has an out of range name length, or the error from the failed
+ * metadata read or squashfs_iget().
+ */
+static struct dentry *squashfs_lookup(struct inode *dir, struct dentry *dentry,
+                                 struct nameidata *nd)
+{
+        const unsigned char *name = dentry->d_name.name;
+        int len = dentry->d_name.len;
+        struct inode *inode = NULL;
+        struct squashfs_sb_info *msblk = dir->i_sb->s_fs_info;
+        struct squashfs_dir_header dirh;
+        struct squashfs_dir_entry *dire;
+        u64 block = squashfs_i(dir)->start + msblk->directory_table;
+        int offset = squashfs_i(dir)->offset;
+        int err, length = 0, dir_count, size;
+        TRACE("Entered squashfs_lookup [%llx:%x]\n", block, offset);
+        /* Reject over-long names before allocating anything */
+        if (len > SQUASHFS_NAME_LEN)
+                return ERR_PTR(-ENAMETOOLONG);
+        dire = kmalloc(sizeof(*dire) + SQUASHFS_NAME_LEN + 1, GFP_KERNEL);
+        if (dire == NULL) {
+                ERROR("Failed to allocate squashfs_dir_entry\n");
+                return ERR_PTR(-ENOMEM);
+        }
+        length = get_dir_index_using_name(dir->i_sb, &block, &offset,
+                                squashfs_i(dir)->dir_idx_start,
+                                squashfs_i(dir)->dir_idx_offset,
+                                squashfs_i(dir)->dir_idx_cnt, name, len);
+        while (length < i_size_read(dir)) {
+                /*
+                 * Read directory header.
+                 */
+                err = squashfs_read_metadata(dir->i_sb, &dirh, &block,
+                                &offset, sizeof(dirh));
+                if (err < 0)
+                        goto read_failure;
+                length += sizeof(dirh);
+                /* The entry count is stored on disk as count - 1 */
+                dir_count = le32_to_cpu(dirh.count) + 1;
+                while (dir_count--) {
+                        /*
+                         * Read directory entry.
+                         */
+                        err = squashfs_read_metadata(dir->i_sb, dire, &block,
+                                        &offset, sizeof(*dire));
+                        if (err < 0)
+                                goto read_failure;
+                        /* The name length is stored on disk as length - 1 */
+                        size = le16_to_cpu(dire->size) + 1;
+                        /*
+                         * Never trust the on-disk length: a longer name would
+                         * be read past the end of the dire allocation.
+                         */
+                        if (size > SQUASHFS_NAME_LEN)
+                                goto data_error;
+                        err = squashfs_read_metadata(dir->i_sb, dire->name,
+                                        &block, &offset, size);
+                        if (err < 0)
+                                goto read_failure;
+                        length += sizeof(*dire) + size;
+                        /* Entries are sorted, so the name cannot follow */
+                        if (name[0] < dire->name[0])
+                                goto exit_lookup;
+                        if (len == size && !strncmp(name, dire->name, len)) {
+                                unsigned int blk, off, ino_num;
+                                long long ino;
+                                blk = le32_to_cpu(dirh.start_block);
+                                off = le16_to_cpu(dire->offset);
+                                /*
+                                 * The entry stores its inode number as a
+                                 * signed 16-bit delta from the header's.
+                                 */
+                                ino_num = le32_to_cpu(dirh.inode_number) +
+                                        (short) le16_to_cpu(dire->inode_number);
+                                ino = SQUASHFS_MKINODE(blk, off);
+                                TRACE("calling squashfs_iget for directory "
+                                        "entry %s, inode  %x:%x, %d\n", name,
+                                        blk, off, ino_num);
+                                inode = squashfs_iget(dir->i_sb, ino, ino_num);
+                                if (IS_ERR(inode)) {
+                                        err = PTR_ERR(inode);
+                                        goto failed;
+                                }
+                                goto exit_lookup;
+                        }
+                }
+        }
+exit_lookup:
+        kfree(dire);
+        if (inode)
+                return d_splice_alias(inode, dentry);
+        /* Not found: record a negative dentry */
+        d_add(dentry, inode);
+        return NULL;
+data_error:
+        err = -EIO;
+read_failure:
+        ERROR("Unable to read directory block [%llx:%x]\n",
+                squashfs_i(dir)->start + msblk->directory_table,
+                squashfs_i(dir)->offset);
+failed:
+        kfree(dire);
+        return ERR_PTR(err);
+}
+/*
+ * Inode operations installed on directory inodes; name lookup is the only
+ * operation provided.
+ */
+const struct inode_operations squashfs_dir_inode_ops = {
+        .lookup = squashfs_lookup,
+};
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
new file mode 100644
index 000000000000..6b2515d027d5
--- /dev/null
+++ b/fs/squashfs/squashfs.h
@@ -0,0 +1,90 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * squashfs.h
+ */
+/*
+ * Logging helpers.  Each one pastes a fixed SQUASHFS tag in front of the
+ * format string (so the format must be a string literal) and forwards to
+ * pr_debug(), pr_err() or pr_warning() respectively.
+ */
+#define TRACE(s, args...)       pr_debug("SQUASHFS: "s, ## args)
+#define ERROR(s, args...)       pr_err("SQUASHFS error: "s, ## args)
+#define WARNING(s, args...)     pr_warning("SQUASHFS: "s, ## args)
+/*
+ * Map a VFS inode to the squashfs_inode_info that embeds it as its
+ * vfs_inode member.  The inode passed in must be one embedded in a
+ * squashfs_inode_info; the result is meaningless otherwise.
+ *
+ * container_of() is used rather than list_entry(): vfs_inode is an
+ * embedded struct inode, not a list_head.
+ */
+static inline struct squashfs_inode_info *squashfs_i(struct inode *inode)
+{
+        return container_of(inode, struct squashfs_inode_info, vfs_inode);
+}
+/* block.c */
+extern int squashfs_read_data(struct super_block *, void **, u64, int, u64 *,
+                                int);
+/* cache.c */
+extern struct squashfs_cache *squashfs_cache_init(char *, int, int);
+extern void squashfs_cache_delete(struct squashfs_cache *);
+extern struct squashfs_cache_entry *squashfs_cache_get(struct super_block *,
+                                struct squashfs_cache *, u64, int);
+extern void squashfs_cache_put(struct squashfs_cache_entry *);
+extern int squashfs_copy_data(void *, struct squashfs_cache_entry *, int, int);
+extern int squashfs_read_metadata(struct super_block *, void *, u64 *,
+                                int *, int);
+extern struct squashfs_cache_entry *squashfs_get_fragment(struct super_block *,
+                                u64, int);
+extern struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *,
+                                u64, int);
+extern int squashfs_read_table(struct super_block *, void *, u64, int);
+/* export.c */
+extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64,
+                                unsigned int);
+/* fragment.c */
+extern int squashfs_frag_lookup(struct super_block *, unsigned int, u64 *);
+extern __le64 *squashfs_read_fragment_index_table(struct super_block *,
+                                u64, unsigned int);
+/* id.c */
+extern int squashfs_get_id(struct super_block *, unsigned int, unsigned int *);
+extern __le64 *squashfs_read_id_index_table(struct super_block *, u64,
+                                unsigned short);
+/* inode.c */
+extern struct inode *squashfs_iget(struct super_block *, long long,
+                                unsigned int);
+extern int squashfs_read_inode(struct inode *, long long);
+/*
+ * Inodes and files operations
+ */
+/* dir.c */
+extern const struct file_operations squashfs_dir_ops;
+/* export.c */
+extern const struct export_operations squashfs_export_ops;
+/* file.c */
+extern const struct address_space_operations squashfs_aops;
+/* namei.c */
+extern const struct inode_operations squashfs_dir_inode_ops;
+/* symlink.c */
+extern const struct address_space_operations squashfs_symlink_aops;
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
new file mode 100644
index 000000000000..283daafc568e
--- /dev/null
+++ b/fs/squashfs/squashfs_fs.h
@@ -0,0 +1,380 @@
+#ifndef SQUASHFS_FS
+#define SQUASHFS_FS
+/*
+ * Squashfs
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * squashfs_fs.h
+ */
+/* number of entries in the fragment cache, taken from the kernel config */
+#define SQUASHFS_CACHED_FRAGMENTS       CONFIG_SQUASHFS_FRAGMENT_CACHE_SIZE
+/* on-disk format version handled by this code */
+#define SQUASHFS_MAJOR                  4
+#define SQUASHFS_MINOR                  0
+/* byte offset at which the superblock is read */
+#define SQUASHFS_START                  0
+/* size of metadata (inode and directory) blocks: 8192 == 1 << 13 */
+#define SQUASHFS_METADATA_SIZE          8192
+#define SQUASHFS_METADATA_LOG           13
+/* default size of data blocks (1 << 17) and the largest accepted (1 << 20) */
+#define SQUASHFS_FILE_SIZE              131072
+#define SQUASHFS_FILE_LOG               17
+#define SQUASHFS_FILE_MAX_SIZE          1048576
+#define SQUASHFS_FILE_MAX_LOG           20
+/* Max number of uids and gids */
+#define SQUASHFS_IDS                    65536
+/* Max length of filename (not 255) */
+#define SQUASHFS_NAME_LEN               256
+/* all-ones sentinels meaning "no fragment" / "no such block or table" */
+#define SQUASHFS_INVALID_FRAG           (0xffffffffU)
+#define SQUASHFS_INVALID_BLK            (-1LL)
+/*
+ * Filesystem flags: bit positions within the superblock flags word.
+ * Bit 2 is not given a name here.
+ */
+#define SQUASHFS_NOI                    0
+#define SQUASHFS_NOD                    1
+#define SQUASHFS_NOF                    3
+#define SQUASHFS_NO_FRAG                4
+#define SQUASHFS_ALWAYS_FRAG            5
+#define SQUASHFS_DUPLICATE              6
+#define SQUASHFS_EXPORT                 7
+/*
+ * Extract one flag bit (result is 0 or 1).  Both arguments are
+ * parenthesised so that compound expressions such as "a | b" are shifted
+ * as a whole instead of being split by operator precedence.
+ */
+#define SQUASHFS_BIT(flag, bit)         (((flag) >> (bit)) & 1)
+/* One accessor per flag bit above; each yields 1 when the bit is set. */
+#define SQUASHFS_UNCOMPRESSED_INODES(flags)     SQUASHFS_BIT(flags, \
+                                                SQUASHFS_NOI)
+#define SQUASHFS_UNCOMPRESSED_DATA(flags)       SQUASHFS_BIT(flags, \
+                                                SQUASHFS_NOD)
+#define SQUASHFS_UNCOMPRESSED_FRAGMENTS(flags)  SQUASHFS_BIT(flags, \
+                                                SQUASHFS_NOF)
+#define SQUASHFS_NO_FRAGMENTS(flags)            SQUASHFS_BIT(flags, \
+                                                SQUASHFS_NO_FRAG)
+#define SQUASHFS_ALWAYS_FRAGMENTS(flags)        SQUASHFS_BIT(flags, \
+                                                SQUASHFS_ALWAYS_FRAG)
+#define SQUASHFS_DUPLICATES(flags)              SQUASHFS_BIT(flags, \
+                                                SQUASHFS_DUPLICATE)
+#define SQUASHFS_EXPORTABLE(flags)              SQUASHFS_BIT(flags, \
+                                                SQUASHFS_EXPORT)
+/*
+ * File type codes.  Values 8-14 are the "L"-prefixed counterparts of
+ * values 1-7 (each is the basic code plus 7).
+ */
+#define SQUASHFS_DIR_TYPE               1
+#define SQUASHFS_REG_TYPE               2
+#define SQUASHFS_SYMLINK_TYPE           3
+#define SQUASHFS_BLKDEV_TYPE            4
+#define SQUASHFS_CHRDEV_TYPE            5
+#define SQUASHFS_FIFO_TYPE              6
+#define SQUASHFS_SOCKET_TYPE            7
+#define SQUASHFS_LDIR_TYPE              8
+#define SQUASHFS_LREG_TYPE              9
+#define SQUASHFS_LSYMLINK_TYPE          10
+#define SQUASHFS_LBLKDEV_TYPE           11
+#define SQUASHFS_LCHRDEV_TYPE           12
+#define SQUASHFS_LFIFO_TYPE             13
+#define SQUASHFS_LSOCKET_TYPE           14
+/*
+ * Length words carry a flag bit alongside the length: the bit is set if the
+ * block is uncompressed and clear if it is compressed.
+ *
+ * Bit 15 variant: SQUASHFS_COMPRESSED_SIZE() strips the flag; a remaining
+ * length of zero is reported as SQUASHFS_COMPRESSED_BIT (32768).
+ * SQUASHFS_COMPRESSED() is true when the flag bit is clear.
+ */
+#define SQUASHFS_COMPRESSED_BIT         (1 << 15)
+#define SQUASHFS_COMPRESSED_SIZE(B)     (((B) & ~SQUASHFS_COMPRESSED_BIT) ? \
+                (B) & ~SQUASHFS_COMPRESSED_BIT :  SQUASHFS_COMPRESSED_BIT)
+#define SQUASHFS_COMPRESSED(B)          (!((B) & SQUASHFS_COMPRESSED_BIT))
+/*
+ * Bit 24 variant: same scheme, but a zero length is returned as zero.
+ * NOTE(review): presumably used for data block length words - confirm
+ * against the callers.
+ */
+#define SQUASHFS_COMPRESSED_BIT_BLOCK   (1 << 24)
+#define SQUASHFS_COMPRESSED_SIZE_BLOCK(B)       ((B) & \
+                                                ~SQUASHFS_COMPRESSED_BIT_BLOCK)
+#define SQUASHFS_COMPRESSED_BLOCK(B)    (!((B) & SQUASHFS_COMPRESSED_BIT_BLOCK))
+/*
+ * Inode number ops.  Inodes consist of a compressed block number, and an
+ * uncompressed offset within that block.  The offset occupies the low 16
+ * bits of the combined value and the block number the bits above them:
+ * SQUASHFS_MKINODE() packs the pair, SQUASHFS_INODE_BLK() and
+ * SQUASHFS_INODE_OFFSET() unpack it.
+ */
+#define SQUASHFS_INODE_BLK(A)           ((unsigned int) ((A) >> 16))
+#define SQUASHFS_INODE_OFFSET(A)        ((unsigned int) ((A) & 0xffff))
+#define SQUASHFS_MKINODE(A, B)          ((long long)(((long long) (A)\
+                                        << 16) + (B)))
+/* Translate between VFS mode and squashfs mode: keep the low 12 bits */
+#define SQUASHFS_MODE(A)                ((A) & 0xfff)
+/*
+ * Table indexing helpers.  The three macro families below (fragment, inode
+ * lookup, id) follow one pattern, where A is an entry index or count:
+ *   x_BYTES(A)          bytes occupied by A entries
+ *   x_INDEX/BLOCK(A)    which SQUASHFS_METADATA_SIZE block holds that byte
+ *   x_..._OFFSET(A)     byte offset inside that block
+ *   x_INDEXES/BLOCKS(A) number of blocks needed for A entries (rounded up)
+ *   x_..._BYTES(A)      size of an array with one u64 per such block
+ */
+/* fragment and fragment table defines */
+#define SQUASHFS_FRAGMENT_BYTES(A)      \
+                                ((A) * sizeof(struct squashfs_fragment_entry))
+#define SQUASHFS_FRAGMENT_INDEX(A)      (SQUASHFS_FRAGMENT_BYTES(A) / \
+                                        SQUASHFS_METADATA_SIZE)
+#define SQUASHFS_FRAGMENT_INDEX_OFFSET(A)       (SQUASHFS_FRAGMENT_BYTES(A) % \
+                                                SQUASHFS_METADATA_SIZE)
+#define SQUASHFS_FRAGMENT_INDEXES(A)    ((SQUASHFS_FRAGMENT_BYTES(A) + \
+                                        SQUASHFS_METADATA_SIZE - 1) / \
+                                        SQUASHFS_METADATA_SIZE)
+#define SQUASHFS_FRAGMENT_INDEX_BYTES(A)        (SQUASHFS_FRAGMENT_INDEXES(A) *\
+                                                sizeof(u64))
+/* inode lookup table defines (entries are u64 sized) */
+#define SQUASHFS_LOOKUP_BYTES(A)        ((A) * sizeof(u64))
+#define SQUASHFS_LOOKUP_BLOCK(A)        (SQUASHFS_LOOKUP_BYTES(A) / \
+                                        SQUASHFS_METADATA_SIZE)
+#define SQUASHFS_LOOKUP_BLOCK_OFFSET(A) (SQUASHFS_LOOKUP_BYTES(A) % \
+                                        SQUASHFS_METADATA_SIZE)
+#define SQUASHFS_LOOKUP_BLOCKS(A)       ((SQUASHFS_LOOKUP_BYTES(A) + \
+                                        SQUASHFS_METADATA_SIZE - 1) / \
+                                        SQUASHFS_METADATA_SIZE)
+#define SQUASHFS_LOOKUP_BLOCK_BYTES(A)  (SQUASHFS_LOOKUP_BLOCKS(A) *\
+                                        sizeof(u64))
+/* uid/gid lookup table defines (entries are unsigned int sized) */
+#define SQUASHFS_ID_BYTES(A)            ((A) * sizeof(unsigned int))
+#define SQUASHFS_ID_BLOCK(A)            (SQUASHFS_ID_BYTES(A) / \
+                                        SQUASHFS_METADATA_SIZE)
+#define SQUASHFS_ID_BLOCK_OFFSET(A)     (SQUASHFS_ID_BYTES(A) % \
+                                        SQUASHFS_METADATA_SIZE)
+#define SQUASHFS_ID_BLOCKS(A)           ((SQUASHFS_ID_BYTES(A) + \
+                                        SQUASHFS_METADATA_SIZE - 1) / \
+                                        SQUASHFS_METADATA_SIZE)
+#define SQUASHFS_ID_BLOCK_BYTES(A)      (SQUASHFS_ID_BLOCKS(A) *\
+                                        sizeof(u64))
+/* cached data constants for filesystem */
+/* number of entries in the "metadata" cache created at mount time */
+#define SQUASHFS_CACHED_BLKS            8
+/* SQUASHFS_MAX_FILE_SIZE evaluates to 1LL << 62 */
+#define SQUASHFS_MAX_FILE_SIZE_LOG      64
+#define SQUASHFS_MAX_FILE_SIZE          (1LL << \
+                                        (SQUASHFS_MAX_FILE_SIZE_LOG - 2))
+#define SQUASHFS_MARKER_BYTE            0xff
+/*
+ * meta index cache
+ *
+ * SQUASHFS_META_INDEXES is the number of unsigned ints that fit in one
+ * metadata block.  Each struct meta_index holds up to SQUASHFS_META_ENTRIES
+ * meta_entry records.
+ * NOTE(review): SQUASHFS_META_SLOTS is presumably the number of meta_index
+ * structures kept per mount - confirm against the allocation in file.c.
+ */
+#define SQUASHFS_META_INDEXES   (SQUASHFS_METADATA_SIZE / sizeof(unsigned int))
+#define SQUASHFS_META_ENTRIES   127
+#define SQUASHFS_META_SLOTS     8
+/* One cached position; the pad field keeps the layout explicitly padded. */
+struct meta_entry {
+        u64                     data_block;
+        unsigned int            index_block;
+        unsigned short          offset;
+        unsigned short          pad;
+};
+/* One cache slot: a header followed by a fixed array of meta_entry. */
+struct meta_index {
+        unsigned int            inode_number;
+        unsigned int            offset;
+        unsigned short          entries;
+        unsigned short          skip;
+        unsigned short          locked;
+        unsigned short          pad;
+        struct meta_entry       meta_entry[SQUASHFS_META_ENTRIES];
+};
+/*
+ * definitions for structures on disk
+ *
+ * All multi-byte fields in these structures are declared little-endian
+ * (__le16/__le32/__le64) and must be converted with le*_to_cpu() before use.
+ */
+/* the only value of squashfs_super_block.compression accepted at mount */
+#define ZLIB_COMPRESSION         1
+/*
+ * On-disk superblock, read from offset SQUASHFS_START at mount time.
+ * The *_table_start fields and bytes_used are 64-bit byte positions;
+ * root_inode is a packed inode reference (see SQUASHFS_MKINODE).
+ */
+struct squashfs_super_block {
+        __le32                  s_magic;
+        __le32                  inodes;
+        __le32                  mkfs_time;
+        __le32                  block_size;
+        __le32                  fragments;
+        __le16                  compression;
+        __le16                  block_log;
+        __le16                  flags;
+        __le16                  no_ids;
+        __le16                  s_major;
+        __le16                  s_minor;
+        __le64                  root_inode;
+        __le64                  bytes_used;
+        __le64                  id_table_start;
+        __le64                  xattr_table_start;
+        __le64                  inode_table_start;
+        __le64                  directory_table_start;
+        __le64                  fragment_table_start;
+        __le64                  lookup_table_start;
+};
+/*
+ * On-disk directory index entry: three fixed fields followed directly by a
+ * variable-length name (zero-length trailing array).
+ * NOTE(review): the relation between size and the length of name[] is not
+ * visible here - confirm against the reader in namei.c/dir.c.
+ */
+struct squashfs_dir_index {
+        __le32                  index;
+        __le32                  start_block;
+        __le32                  size;
+        unsigned char           name[0];
+};
+/*
+ * On-disk inode layouts.  Every variant begins with the same six fields as
+ * squashfs_base_inode (inode_type, mode, uid, guid, mtime, inode_number),
+ * so the common header can be read through any of them.  Field order and
+ * widths define the disk format and must not be changed.
+ */
+/* common header shared by all inode variants */
+struct squashfs_base_inode {
+        __le16                  inode_type;
+        __le16                  mode;
+        __le16                  uid;
+        __le16                  guid;
+        __le32                  mtime;
+        __le32                  inode_number;
+};
+/* common header plus a link count */
+struct squashfs_ipc_inode {
+        __le16                  inode_type;
+        __le16                  mode;
+        __le16                  uid;
+        __le16                  guid;
+        __le32                  mtime;
+        __le32                  inode_number;
+        __le32                  nlink;
+};
+/* common header plus link count and device number */
+struct squashfs_dev_inode {
+        __le16                  inode_type;
+        __le16                  mode;
+        __le16                  uid;
+        __le16                  guid;
+        __le32                  mtime;
+        __le32                  inode_number;
+        __le32                  nlink;
+        __le32                  rdev;
+};
+/* symlink: the target text follows the fixed fields in symlink[] */
+struct squashfs_symlink_inode {
+        __le16                  inode_type;
+        __le16                  mode;
+        __le16                  uid;
+        __le16                  guid;
+        __le32                  mtime;
+        __le32                  inode_number;
+        __le32                  nlink;
+        __le32                  symlink_size;
+        char                    symlink[0];
+};
+/* regular file with 32-bit start_block/file_size; block_list[] trails */
+struct squashfs_reg_inode {
+        __le16                  inode_type;
+        __le16                  mode;
+        __le16                  uid;
+        __le16                  guid;
+        __le32                  mtime;
+        __le32                  inode_number;
+        __le32                  start_block;
+        __le32                  fragment;
+        __le32                  offset;
+        __le32                  file_size;
+        __le16                  block_list[0];
+};
+/*
+ * regular file with 64-bit start_block/file_size, plus sparse, nlink and
+ * xattr fields that squashfs_reg_inode lacks; block_list[] trails
+ */
+struct squashfs_lreg_inode {
+        __le16                  inode_type;
+        __le16                  mode;
+        __le16                  uid;
+        __le16                  guid;
+        __le32                  mtime;
+        __le32                  inode_number;
+        __le64                  start_block;
+        __le64                  file_size;
+        __le64                  sparse;
+        __le32                  nlink;
+        __le32                  fragment;
+        __le32                  offset;
+        __le32                  xattr;
+        __le16                  block_list[0];
+};
+/* directory with a 16-bit file_size and no index */
+struct squashfs_dir_inode {
+        __le16                  inode_type;
+        __le16                  mode;
+        __le16                  uid;
+        __le16                  guid;
+        __le32                  mtime;
+        __le32                  inode_number;
+        __le32                  start_block;
+        __le32                  nlink;
+        __le16                  file_size;
+        __le16                  offset;
+        __le32                  parent_inode;
+};
+/*
+ * directory with a 32-bit file_size, an xattr field and i_count followed by
+ * trailing squashfs_dir_index records in index[]
+ */
+struct squashfs_ldir_inode {
+        __le16                  inode_type;
+        __le16                  mode;
+        __le16                  uid;
+        __le16                  guid;
+        __le32                  mtime;
+        __le32                  inode_number;
+        __le32                  nlink;
+        __le32                  file_size;
+        __le32                  start_block;
+        __le32                  parent_inode;
+        __le16                  i_count;
+        __le16                  offset;
+        __le32                  xattr;
+        struct squashfs_dir_index       index[0];
+};
+/* overlay of every inode variant on the same storage */
+union squashfs_inode {
+        struct squashfs_base_inode              base;
+        struct squashfs_dev_inode               dev;
+        struct squashfs_symlink_inode           symlink;
+        struct squashfs_reg_inode               reg;
+        struct squashfs_lreg_inode              lreg;
+        struct squashfs_dir_inode               dir;
+        struct squashfs_ldir_inode              ldir;
+        struct squashfs_ipc_inode               ipc;
+};
+/*
+ * On-disk directory entry; the name follows the fixed fields.
+ * As used by squashfs_lookup(): offset is combined with the enclosing
+ * header's start_block via SQUASHFS_MKINODE() to locate the inode, and
+ * inode_number is treated as a signed 16-bit delta added to the header's
+ * inode_number.
+ */
+struct squashfs_dir_entry {
+        __le16                  offset;
+        __le16                  inode_number;
+        __le16                  type;
+        __le16                  size;
+        char                    name[0];
+};
+/*
+ * On-disk directory header preceding a run of squashfs_dir_entry records;
+ * start_block and inode_number are the base values those entries refer to.
+ */
+struct squashfs_dir_header {
+        __le32                  count;
+        __le32                  start_block;
+        __le32                  inode_number;
+};
+/* On-disk fragment table entry (16 bytes; the last field is unused). */
+struct squashfs_fragment_entry {
+        __le64                  start_block;
+        __le32                  size;
+        unsigned int            unused;
+};
+#endif
diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h
new file mode 100644
index 000000000000..fbfca30c0c68
--- /dev/null
+++ b/fs/squashfs/squashfs_fs_i.h
@@ -0,0 +1,45 @@
+#ifndef SQUASHFS_FS_I
+#define SQUASHFS_FS_I
+/*
+ * Squashfs
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * squashfs_fs_i.h
+ */
+/*
+ * In-memory squashfs inode.  The VFS inode is embedded as vfs_inode, and
+ * squashfs_i() converts a struct inode pointer back to this structure.
+ *
+ * start/offset: for a directory, squashfs_lookup() reports its position as
+ * start + msblk->directory_table and offset.
+ *
+ * The anonymous union overlays two sets of fields, one named for fragment
+ * and block-list data and one named for directory index data.
+ * NOTE(review): presumably only the set matching the inode's type is
+ * valid - confirm in inode.c.
+ */
+struct squashfs_inode_info {
+        u64             start;
+        int             offset;
+        union {
+                struct {
+                        u64             fragment_block;
+                        int             fragment_size;
+                        int             fragment_offset;
+                        u64             block_list_start;
+                };
+                struct {
+                        u64             dir_idx_start;
+                        int             dir_idx_offset;
+                        int             dir_idx_cnt;
+                        int             parent;
+                };
+        };
+        struct inode    vfs_inode;
+};
+#endif
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
new file mode 100644
index 000000000000..c8c65614dd1c
--- /dev/null
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -0,0 +1,76 @@
+#ifndef SQUASHFS_FS_SB
+#define SQUASHFS_FS_SB
+/*
+ * Squashfs
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * squashfs_fs_sb.h
+ */
+#include "squashfs_fs.h"
+/*
+ * A cache of blocks, created with squashfs_cache_init(name, entries,
+ * block_size) and destroyed with squashfs_cache_delete().  The individual
+ * slots are squashfs_cache_entry structures reached through entry.
+ * NOTE(review): lock and wait_queue presumably serialise slot allocation
+ * and let callers sleep until a slot is free - confirm in cache.c.
+ */
+struct squashfs_cache {
+        char                    *name;
+        int                     entries;
+        int                     next_blk;
+        int                     num_waiters;
+        int                     unused;
+        int                     block_size;
+        int                     pages;
+        spinlock_t              lock;
+        wait_queue_head_t       wait_queue;
+        struct squashfs_cache_entry *entry;
+};
+/*
+ * One slot of a squashfs_cache, obtained with squashfs_cache_get() and
+ * released with squashfs_cache_put().  cache points back to the owning
+ * cache and data to the slot's buffers.
+ * NOTE(review): the exact meaning of pending/error/refcount is defined by
+ * cache.c, which is not visible here.
+ */
+struct squashfs_cache_entry {
+        u64                     block;
+        int                     length;
+        int                     refcount;
+        u64                     next_index;
+        int                     pending;
+        int                     error;
+        int                     num_waiters;
+        wait_queue_head_t       wait_queue;
+        struct squashfs_cache   *cache;
+        void                    **data;
+};
+/*
+ * Per-mount private data, allocated (zeroed) by squashfs_fill_super() and
+ * stored in sb->s_fs_info.  As set up there:
+ *   devblksize, devblksize_log2  device block size and its log2
+ *   block_cache                  "metadata" cache
+ *   read_page                    single-entry "data" cache of block_size
+ *   fragment_cache               "fragment" cache, only created when the
+ *                                superblock reports fragments
+ *   id_table, fragment_index,
+ *   inode_lookup_table           index tables read at mount; they stay NULL
+ *                                when absent or on failure
+ *   stream                       zlib stream; its workspace is allocated at
+ *                                mount
+ *   inode_table, directory_table,
+ *   block_size, block_log,
+ *   bytes_used, inodes           values copied from the on-disk superblock
+ */
+struct squashfs_sb_info {
+        int                     devblksize;
+        int                     devblksize_log2;
+        struct squashfs_cache   *block_cache;
+        struct squashfs_cache   *fragment_cache;
+        struct squashfs_cache   *read_page;
+        int                     next_meta_index;
+        __le64                  *id_table;
+        __le64                  *fragment_index;
+        unsigned int            *fragment_index_2;
+        struct mutex            read_data_mutex;
+        struct mutex            meta_index_mutex;
+        struct meta_index       *meta_index;
+        z_stream                stream;
+        __le64                  *inode_lookup_table;
+        u64                     inode_table;
+        u64                     directory_table;
+        unsigned int            block_size;
+        unsigned short          block_log;
+        long long               bytes_used;
+        unsigned int            inodes;
+};
+#endif
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
new file mode 100644
index 000000000000..071df5b5b491
--- /dev/null
+++ b/fs/squashfs/super.c
@@ -0,0 +1,441 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * super.c
+ */
+/*
+ * This file implements code to read the superblock, read and initialise
+ * in-memory structures at mount time, and all the VFS glue code to register
+ * the filesystem.
+ */
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/slab.h>
+#include <linux/mutex.h>
+#include <linux/pagemap.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/zlib.h>
+#include <linux/magic.h>
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+static struct file_system_type squashfs_fs_type;
+static struct super_operations squashfs_super_ops;
+/*
+ * Decide whether this driver can mount a filesystem with the given on-disk
+ * version and compression type.
+ *
+ * Returns 0 when major equals SQUASHFS_MAJOR, minor does not exceed
+ * SQUASHFS_MINOR and comp is ZLIB_COMPRESSION; otherwise returns -EINVAL.
+ * Version mismatches are logged; an unsupported compression type is
+ * rejected without a message.
+ */
+static int supported_squashfs_filesystem(short major, short minor, short comp)
+{
+        const int too_old = major < SQUASHFS_MAJOR;
+        const int too_new = major > SQUASHFS_MAJOR || minor > SQUASHFS_MINOR;
+        /* The "older" case is tested first, exactly as an if/else chain. */
+        if (too_old) {
+                ERROR("Major/Minor mismatch, older Squashfs %d.%d "
+                        "filesystems are unsupported\n", major, minor);
+                return -EINVAL;
+        }
+        if (too_new) {
+                ERROR("Major/Minor mismatch, trying to mount newer "
+                        "%d.%d filesystem\n", major, minor);
+                ERROR("Please update your kernel\n");
+                return -EINVAL;
+        }
+        return comp == ZLIB_COMPRESSION ? 0 : -EINVAL;
+}
+/*
+ * squashfs_fill_super - read the on-disk superblock and set up the mount.
+ *
+ * Allocates the per-mount squashfs_sb_info (sb->s_fs_info) and the zlib
+ * workspace, reads and sanity-checks the superblock, creates the metadata,
+ * data and (when the image has fragments) fragment caches, loads the id,
+ * fragment and inode lookup index tables, and finally reads the root inode
+ * and installs it as sb->s_root.
+ *
+ * @sb:     superblock being filled in
+ * @data:   mount data (unused here)
+ * @silent: when non-zero, suppress the "no superblock found" message
+ *
+ * Returns 0 on success or a negative errno.  On failure everything that
+ * was allocated here is released and sb->s_fs_info is reset to NULL.
+ */
+static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
+{
+        struct squashfs_sb_info *msblk;
+        struct squashfs_super_block *sblk = NULL;
+        char b[BDEVNAME_SIZE];
+        struct inode *root;
+        long long root_inode;
+        unsigned short flags;
+        unsigned int fragments;
+        u64 lookup_table_start;
+        int err;
+        TRACE("Entered squashfs_fill_superblock\n");
+        sb->s_fs_info = kzalloc(sizeof(*msblk), GFP_KERNEL);
+        if (sb->s_fs_info == NULL) {
+                ERROR("Failed to allocate squashfs_sb_info\n");
+                return -ENOMEM;
+        }
+        msblk = sb->s_fs_info;
+        msblk->stream.workspace = kmalloc(zlib_inflate_workspacesize(),
+                GFP_KERNEL);
+        if (msblk->stream.workspace == NULL) {
+                ERROR("Failed to allocate zlib workspace\n");
+                goto failure;
+        }
+        sblk = kzalloc(sizeof(*sblk), GFP_KERNEL);
+        if (sblk == NULL) {
+                ERROR("Failed to allocate squashfs_super_block\n");
+                goto failure;
+        }
+        /*
+         * sb_min_blocksize() returns 0 if the block size cannot be set.
+         * ffz(~0) is undefined, so bail out before computing the log2.
+         */
+        msblk->devblksize = sb_min_blocksize(sb, BLOCK_SIZE);
+        if (!msblk->devblksize) {
+                ERROR("unable to set blocksize\n");
+                err = -EINVAL;
+                goto failed_mount;
+        }
+        msblk->devblksize_log2 = ffz(~msblk->devblksize);
+        mutex_init(&msblk->read_data_mutex);
+        mutex_init(&msblk->meta_index_mutex);
+        /*
+         * msblk->bytes_used is checked in squashfs_read_table to ensure reads
+         * are not beyond filesystem end.  But as we're using
+         * squashfs_read_table here to read the superblock (including the value
+         * of bytes_used) we need to set it to an initial sensible dummy value
+         */
+        msblk->bytes_used = sizeof(*sblk);
+        err = squashfs_read_table(sb, sblk, SQUASHFS_START, sizeof(*sblk));
+        if (err < 0) {
+                ERROR("unable to read squashfs_super_block\n");
+                goto failed_mount;
+        }
+        /* Check it is a SQUASHFS superblock */
+        sb->s_magic = le32_to_cpu(sblk->s_magic);
+        if (sb->s_magic != SQUASHFS_MAGIC) {
+                if (!silent)
+                        ERROR("Can't find a SQUASHFS superblock on %s\n",
+                                                bdevname(sb->s_bdev, b));
+                err = -EINVAL;
+                goto failed_mount;
+        }
+        /* Check the MAJOR & MINOR versions and compression type */
+        err = supported_squashfs_filesystem(le16_to_cpu(sblk->s_major),
+                        le16_to_cpu(sblk->s_minor),
+                        le16_to_cpu(sblk->compression));
+        if (err < 0)
+                goto failed_mount;
+        /* Every sanity check below fails the mount with -EINVAL */
+        err = -EINVAL;
+        /*
+         * Check if there's xattrs in the filesystem.  These are not
+         * supported in this version, so warn that they will be ignored.
+         */
+        if (le64_to_cpu(sblk->xattr_table_start) != SQUASHFS_INVALID_BLK)
+                ERROR("Xattrs in filesystem, these will be ignored\n");
+        /* Check the filesystem does not extend beyond the end of the
+           block device */
+        msblk->bytes_used = le64_to_cpu(sblk->bytes_used);
+        if (msblk->bytes_used < 0 || msblk->bytes_used >
+                        i_size_read(sb->s_bdev->bd_inode))
+                goto failed_mount;
+        /* Check block size for sanity */
+        msblk->block_size = le32_to_cpu(sblk->block_size);
+        if (msblk->block_size > SQUASHFS_FILE_MAX_SIZE)
+                goto failed_mount;
+        msblk->block_log = le16_to_cpu(sblk->block_log);
+        if (msblk->block_log > SQUASHFS_FILE_MAX_LOG)
+                goto failed_mount;
+        /*
+         * block_size and block_log come from the disk independently; they
+         * must describe the same power of two, otherwise shifts by
+         * block_log and sizes based on block_size would disagree.
+         */
+        if (msblk->block_size != (1U << msblk->block_log))
+                goto failed_mount;
+        /* Check the root inode for sanity */
+        root_inode = le64_to_cpu(sblk->root_inode);
+        if (SQUASHFS_INODE_OFFSET(root_inode) > SQUASHFS_METADATA_SIZE)
+                goto failed_mount;
+        msblk->inode_table = le64_to_cpu(sblk->inode_table_start);
+        msblk->directory_table = le64_to_cpu(sblk->directory_table_start);
+        msblk->inodes = le32_to_cpu(sblk->inodes);
+        flags = le16_to_cpu(sblk->flags);
+        TRACE("Found valid superblock on %s\n", bdevname(sb->s_bdev, b));
+        TRACE("Inodes are %scompressed\n", SQUASHFS_UNCOMPRESSED_INODES(flags)
+                                ? "un" : "");
+        TRACE("Data is %scompressed\n", SQUASHFS_UNCOMPRESSED_DATA(flags)
+                                ? "un" : "");
+        TRACE("Filesystem size %lld bytes\n", msblk->bytes_used);
+        TRACE("Block size %d\n", msblk->block_size);
+        TRACE("Number of inodes %d\n", msblk->inodes);
+        TRACE("Number of fragments %d\n", le32_to_cpu(sblk->fragments));
+        TRACE("Number of ids %d\n", le16_to_cpu(sblk->no_ids));
+        TRACE("sblk->inode_table_start %llx\n", msblk->inode_table);
+        TRACE("sblk->directory_table_start %llx\n", msblk->directory_table);
+        TRACE("sblk->fragment_table_start %llx\n",
+                (u64) le64_to_cpu(sblk->fragment_table_start));
+        TRACE("sblk->id_table_start %llx\n",
+                (u64) le64_to_cpu(sblk->id_table_start));
+        sb->s_maxbytes = MAX_LFS_FILESIZE;
+        sb->s_flags |= MS_RDONLY;
+        sb->s_op = &squashfs_super_ops;
+        /* From here on the default failure reason is allocation failure */
+        err = -ENOMEM;
+        msblk->block_cache = squashfs_cache_init("metadata",
+                        SQUASHFS_CACHED_BLKS, SQUASHFS_METADATA_SIZE);
+        if (msblk->block_cache == NULL)
+                goto failed_mount;
+        /* Allocate read_page block */
+        msblk->read_page = squashfs_cache_init("data", 1, msblk->block_size);
+        if (msblk->read_page == NULL) {
+                ERROR("Failed to allocate read_page block\n");
+                goto failed_mount;
+        }
+        /* Allocate and read id index table */
+        msblk->id_table = squashfs_read_id_index_table(sb,
+                le64_to_cpu(sblk->id_table_start), le16_to_cpu(sblk->no_ids));
+        if (IS_ERR(msblk->id_table)) {
+                err = PTR_ERR(msblk->id_table);
+                /* keep the error pointer away from kfree() in failed_mount */
+                msblk->id_table = NULL;
+                goto failed_mount;
+        }
+        /* An image without fragments needs no fragment cache or table */
+        fragments = le32_to_cpu(sblk->fragments);
+        if (fragments == 0)
+                goto allocate_lookup_table;
+        msblk->fragment_cache = squashfs_cache_init("fragment",
+                SQUASHFS_CACHED_FRAGMENTS, msblk->block_size);
+        if (msblk->fragment_cache == NULL) {
+                err = -ENOMEM;
+                goto failed_mount;
+        }
+        /* Allocate and read fragment index table */
+        msblk->fragment_index = squashfs_read_fragment_index_table(sb,
+                le64_to_cpu(sblk->fragment_table_start), fragments);
+        if (IS_ERR(msblk->fragment_index)) {
+                err = PTR_ERR(msblk->fragment_index);
+                msblk->fragment_index = NULL;
+                goto failed_mount;
+        }
+allocate_lookup_table:
+        /* The export lookup table is optional; skip it when not present */
+        lookup_table_start = le64_to_cpu(sblk->lookup_table_start);
+        if (lookup_table_start == SQUASHFS_INVALID_BLK)
+                goto allocate_root;
+        /* Allocate and read inode lookup table */
+        msblk->inode_lookup_table = squashfs_read_inode_lookup_table(sb,
+                lookup_table_start, msblk->inodes);
+        if (IS_ERR(msblk->inode_lookup_table)) {
+                err = PTR_ERR(msblk->inode_lookup_table);
+                msblk->inode_lookup_table = NULL;
+                goto failed_mount;
+        }
+        sb->s_export_op = &squashfs_export_ops;
+allocate_root:
+        root = new_inode(sb);
+        if (!root) {
+                err = -ENOMEM;
+                goto failed_mount;
+        }
+        err = squashfs_read_inode(root, root_inode);
+        if (err) {
+                /*
+                 * root came from new_inode(), not iget_locked(), so it was
+                 * never marked I_NEW and iget_failed() (which also unlocks
+                 * a new inode) does not apply.  Mark it bad and drop the
+                 * reference directly.
+                 */
+                make_bad_inode(root);
+                iput(root);
+                goto failed_mount;
+        }
+        insert_inode_hash(root);
+        sb->s_root = d_alloc_root(root);
+        if (sb->s_root == NULL) {
+                ERROR("Root inode create failed\n");
+                err = -ENOMEM;
+                iput(root);
+                goto failed_mount;
+        }
+        TRACE("Leaving squashfs_fill_super\n");
+        /* The on-disk superblock copy is only needed during mount */
+        kfree(sblk);
+        return 0;
+failed_mount:
+        /*
+         * Reached with any subset of the resources allocated; members
+         * that were never set are still NULL from kzalloc().
+         */
+        squashfs_cache_delete(msblk->block_cache);
+        squashfs_cache_delete(msblk->fragment_cache);
+        squashfs_cache_delete(msblk->read_page);
+        kfree(msblk->inode_lookup_table);
+        kfree(msblk->fragment_index);
+        kfree(msblk->id_table);
+        kfree(msblk->stream.workspace);
+        kfree(sb->s_fs_info);
+        sb->s_fs_info = NULL;
+        kfree(sblk);
+        return err;
+failure:
+        /* Early allocation failures: only msblk and the workspace exist */
+        kfree(msblk->stream.workspace);
+        kfree(sb->s_fs_info);
+        sb->s_fs_info = NULL;
+        return -ENOMEM;
+}
+/*
+ * Fill in filesystem statistics for statfs().
+ *
+ * The block count is bytes_used rounded up to a whole number of
+ * (1 << block_log)-byte units.  All of the "free" counters are reported
+ * as zero.  Always returns 0.
+ */
+static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+        struct super_block *sb = dentry->d_sb;
+        struct squashfs_sb_info *sbi = sb->s_fs_info;
+        TRACE("Entered squashfs_statfs\n");
+        /* Identity and geometry */
+        buf->f_type = SQUASHFS_MAGIC;
+        buf->f_bsize = sbi->block_size;
+        buf->f_namelen = SQUASHFS_NAME_LEN;
+        /* Usage */
+        buf->f_blocks = ((sbi->bytes_used - 1) >> sbi->block_log) + 1;
+        buf->f_files = sbi->inodes;
+        /* No free blocks or inodes are ever reported */
+        buf->f_bfree = 0;
+        buf->f_bavail = 0;
+        buf->f_ffree = 0;
+        return 0;
+}
+/*
+ * Remount handler.  Squashfs is a read-only filesystem, so MS_RDONLY is
+ * forced into the caller's flags whatever was requested.  @sb and @data
+ * are not used.  Always returns 0.
+ */
+static int squashfs_remount(struct super_block *sb, int *flags, char *data)
+{
+        *flags = *flags | MS_RDONLY;
+        return 0;
+}
+/*
+ * Release everything hanging off sb->s_fs_info when the superblock is
+ * torn down: the three caches, the id, fragment-index, inode-lookup and
+ * meta-index tables, the stream workspace and finally the
+ * squashfs_sb_info itself.  s_fs_info is cleared afterwards; if it is
+ * already NULL the call does nothing.
+ */
+static void squashfs_put_super(struct super_block *sb)
+{
+        struct squashfs_sb_info *sbi = sb->s_fs_info;
+        if (sbi == NULL)
+                return;
+        squashfs_cache_delete(sbi->block_cache);
+        squashfs_cache_delete(sbi->fragment_cache);
+        squashfs_cache_delete(sbi->read_page);
+        kfree(sbi->id_table);
+        kfree(sbi->fragment_index);
+        /*
+         * squashfs_fill_super() allocates the inode lookup table and frees
+         * it on its own failure path; it has to be freed here as well,
+         * otherwise it is leaked on unmount whenever the image has one.
+         * The pointer stays NULL when no table was read, and kfree(NULL)
+         * is a no-op.
+         */
+        kfree(sbi->inode_lookup_table);
+        kfree(sbi->meta_index);
+        kfree(sbi->stream.workspace);
+        kfree(sbi);
+        sb->s_fs_info = NULL;
+}
+/*
+ * Mount entry point: hands off to get_sb_bdev() with
+ * squashfs_fill_super as the superblock filler and returns its result.
+ */
+static int squashfs_get_sb(struct file_system_type *fs_type, int flags,
+                                const char *dev_name, void *data,
+                                struct vfsmount *mnt)
+{
+        int err;
+        err = get_sb_bdev(fs_type, flags, dev_name, data,
+                                squashfs_fill_super, mnt);
+        return err;
+}
+/* Slab cache that squashfs_inode_info objects are allocated from. */
+static struct kmem_cache *squashfs_inode_cachep;
+/*
+ * Slab constructor handed to kmem_cache_create(): initialises the VFS
+ * inode embedded in the squashfs_inode_info object @foo.
+ */
+static void init_once(void *foo)
+{
+        struct squashfs_inode_info *info = foo;
+        struct inode *vfs = &info->vfs_inode;
+        inode_init_once(vfs);
+}
+/*
+ * Create the slab cache for squashfs_inode_info objects.
+ * Returns 0 on success, -ENOMEM if the cache could not be created.
+ */
+static int __init init_inodecache(void)
+{
+        squashfs_inode_cachep = kmem_cache_create("squashfs_inode_cache",
+                        sizeof(struct squashfs_inode_info), 0,
+                        SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT,
+                        init_once);
+        if (squashfs_inode_cachep == NULL)
+                return -ENOMEM;
+        return 0;
+}
+/* Destroy the squashfs inode slab cache. */
+static void destroy_inodecache(void)
+{
+        kmem_cache_destroy(squashfs_inode_cachep);
+}
+/*
+ * Module initialisation: create the inode cache, then register the
+ * filesystem type.  If registration fails the inode cache is destroyed
+ * again.  Returns 0 on success or the error from the step that failed.
+ */
+static int __init init_squashfs_fs(void)
+{
+        int err;
+        err = init_inodecache();
+        if (err)
+                goto out;
+        err = register_filesystem(&squashfs_fs_type);
+        if (err)
+                goto out_cache;
+        printk(KERN_INFO "squashfs: version 4.0 (2009/01/03) "
+                "Phillip Lougher\n");
+        return 0;
+out_cache:
+        destroy_inodecache();
+out:
+        return err;
+}
+/*
+ * Module exit: unregister the filesystem type first, then destroy the
+ * inode cache.
+ */
+static void __exit exit_squashfs_fs(void)
+{
+        unregister_filesystem(&squashfs_fs_type);
+        destroy_inodecache();
+}
+/*
+ * Allocate a squashfs_inode_info from the inode slab cache and return
+ * the VFS inode embedded in it, or NULL if the allocation fails.
+ */
+static struct inode *squashfs_alloc_inode(struct super_block *sb)
+{
+        struct squashfs_inode_info *info;
+        info = kmem_cache_alloc(squashfs_inode_cachep, GFP_KERNEL);
+        if (info == NULL)
+                return NULL;
+        return &info->vfs_inode;
+}
+/*
+ * Return the squashfs_inode_info that contains @inode to the inode slab
+ * cache.
+ */
+static void squashfs_destroy_inode(struct inode *inode)
+{
+        struct squashfs_inode_info *info = squashfs_i(inode);
+        kmem_cache_free(squashfs_inode_cachep, info);
+}
+/* Filesystem type registered by init_squashfs_fs(). */
+static struct file_system_type squashfs_fs_type = {
+        .owner    = THIS_MODULE,
+        .name     = "squashfs",
+        .get_sb   = squashfs_get_sb,
+        .kill_sb  = kill_block_super,
+        .fs_flags = FS_REQUIRES_DEV,
+};
+/* Superblock operations installed in sb->s_op by squashfs_fill_super(). */
+static struct super_operations squashfs_super_ops = {
+        .alloc_inode   = squashfs_alloc_inode,
+        .destroy_inode = squashfs_destroy_inode,
+        .statfs        = squashfs_statfs,
+        .put_super     = squashfs_put_super,
+        .remount_fs    = squashfs_remount,
+};
+module_init(init_squashfs_fs);
+module_exit(exit_squashfs_fs);
+MODULE_DESCRIPTION("squashfs 4.0, a compressed read-only filesystem");
+MODULE_AUTHOR("Phillip Lougher <phillip@lougher.demon.co.uk>");
+MODULE_LICENSE("GPL");
diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c
new file mode 100644
index 000000000000..83d87880aac8
--- /dev/null
+++ b/fs/squashfs/symlink.c
@@ -0,0 +1,118 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * symlink.c
+ */
+/*
+ * This file implements code to handle symbolic links.
+ *
+ * The data contents of symbolic links are stored inside the symbolic
+ * link inode within the inode table.  This allows the normally small symbolic
+ * link to be compressed as part of the inode table, achieving much greater
+ * compression than if the symbolic link was compressed individually.
+ */
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/pagemap.h>
+#include <linux/zlib.h>
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+/*
+ * Fill one page-cache page of a symbolic link from the metadata cache.
+ *
+ * The link data starts at squashfs_i(inode)->start / ->offset.  For any
+ * page other than the first, the preceding bytes are skipped with
+ * squashfs_read_metadata().  The page's bytes are then copied out of
+ * entries obtained from msblk->block_cache, and the remainder of the page
+ * is zero-filled.
+ *
+ * The page is unlocked on every path and 0 is always returned; a read
+ * failure is signalled only by setting the page's error flag.
+ */
+static int squashfs_symlink_readpage(struct file *file, struct page *page)
+{
+        struct inode *inode = page->mapping->host;
+        struct super_block *sb = inode->i_sb;
+        struct squashfs_sb_info *msblk = sb->s_fs_info;
+        /* Byte offset of this page within the symlink data */
+        int index = page->index << PAGE_CACHE_SHIFT;
+        u64 block = squashfs_i(inode)->start;
+        int offset = squashfs_i(inode)->offset;
+        /* Bytes of link data belonging to this page, at most one page */
+        int length = min_t(int, i_size_read(inode) - index, PAGE_CACHE_SIZE);
+        int bytes, copied;
+        void *pageaddr;
+        struct squashfs_cache_entry *entry;
+        TRACE("Entered squashfs_symlink_readpage, page index %ld, start block "
+                        "%llx, offset %x\n", page->index, block, offset);
+        /*
+         * Skip index bytes into symlink metadata.
+         */
+        if (index) {
+                bytes = squashfs_read_metadata(sb, NULL, &block, &offset,
+                                                                index);
+                if (bytes < 0) {
+                        ERROR("Unable to read symlink [%llx:%x]\n",
+                                squashfs_i(inode)->start,
+                                squashfs_i(inode)->offset);
+                        goto error_out;
+                }
+        }
+        /*
+         * Read length bytes from symlink metadata.  Squashfs_read_metadata
+         * is not used here because it can sleep and we want to use
+         * kmap_atomic to map the page.  Instead call the underlying
+         * squashfs_cache_get routine.  As length bytes may overlap metadata
+         * blocks, we may need to call squashfs_cache_get multiple times.
+         */
+        for (bytes = 0; bytes < length; offset = 0, bytes += copied) {
+                entry = squashfs_cache_get(sb, msblk->block_cache, block, 0);
+                if (entry->error) {
+                        ERROR("Unable to read symlink [%llx:%x]\n",
+                                squashfs_i(inode)->start,
+                                squashfs_i(inode)->offset);
+                        /* Drop the cache entry before bailing out */
+                        squashfs_cache_put(entry);
+                        goto error_out;
+                }
+                pageaddr = kmap_atomic(page, KM_USER0);
+                copied = squashfs_copy_data(pageaddr + bytes, entry, offset,
+                                                                length - bytes);
+                /*
+                 * Everything still wanted was copied: zero the rest of the
+                 * page.  Otherwise carry on from the start of the next
+                 * block (offset is reset to 0 by the loop increment).
+                 */
+                if (copied == length - bytes)
+                        memset(pageaddr + length, 0, PAGE_CACHE_SIZE - length);
+                else
+                        block = entry->next_index;
+                kunmap_atomic(pageaddr, KM_USER0);
+                squashfs_cache_put(entry);
+        }
+        flush_dcache_page(page);
+        SetPageUptodate(page);
+        unlock_page(page);
+        return 0;
+error_out:
+        /* Flag the failure on the page; the function itself still returns 0 */
+        SetPageError(page);
+        unlock_page(page);
+        return 0;
+}
+/* Address-space operations for symlinks: only readpage is provided. */
+const struct address_space_operations squashfs_symlink_aops = {
+        .readpage = squashfs_symlink_readpage,
+};
diff --git a/fs/stat.c b/fs/stat.c
index 7c46fbeb8b76..2db740a0cfb5 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -152,7 +152,7 @@ static int cp_old_stat(struct kstat *stat, struct __old_kernel_stat __user * sta
        return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0;
 }
-asmlinkage long sys_stat(char __user * filename, struct __old_kernel_stat __user * statbuf)
+SYSCALL_DEFINE2(stat, char __user *, filename, struct __old_kernel_stat __user *, statbuf)
 {
        struct kstat stat;
        int error = vfs_stat_fd(AT_FDCWD, filename, &stat);
@@ -162,7 +162,8 @@ asmlinkage long sys_stat(char __user * filename, struct __old_kernel_stat __user
        return error;
 }
-asmlinkage long sys_lstat(char __user * filename, struct __old_kernel_stat __user * statbuf)
+SYSCALL_DEFINE2(lstat, char __user *, filename, struct __old_kernel_stat __user *, statbuf)
 {
        struct kstat stat;
        int error = vfs_lstat_fd(AT_FDCWD, filename, &stat);
@@ -172,7 +173,8 @@ asmlinkage long sys_lstat(char __user * filename, struct __old_kernel_stat __use
        return error;
 }
-asmlinkage long sys_fstat(unsigned int fd, struct __old_kernel_stat __user * statbuf)
+SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, statbuf)
 {
        struct kstat stat;
        int error = vfs_fstat(fd, &stat);
@@ -235,7 +237,7 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf)
        return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0;
 }
-asmlinkage long sys_newstat(char __user *filename, struct stat __user *statbuf)
+SYSCALL_DEFINE2(newstat, char __user *, filename, struct stat __user *, statbuf)
 {
        struct kstat stat;
        int error = vfs_stat_fd(AT_FDCWD, filename, &stat);
@@ -246,7 +248,7 @@ asmlinkage long sys_newstat(char __user *filename, struct stat __user *statbuf)
        return error;
 }
-asmlinkage long sys_newlstat(char __user *filename, struct stat __user *statbuf)
+SYSCALL_DEFINE2(newlstat, char __user *, filename, struct stat __user *, statbuf)
 {
        struct kstat stat;
        int error = vfs_lstat_fd(AT_FDCWD, filename, &stat);
@@ -258,8 +260,8 @@ asmlinkage long sys_newlstat(char __user *filename, struct stat __user *statbuf)
 }
 #if !defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_SYS_NEWFSTATAT)
-asmlinkage long sys_newfstatat(int dfd, char __user *filename,
+SYSCALL_DEFINE4(newfstatat, int, dfd, char __user *, filename,
-                                struct stat __user *statbuf, int flag)
+                struct stat __user *, statbuf, int, flag)
 {
        struct kstat stat;
        int error = -EINVAL;
@@ -280,7 +282,7 @@ out:
 }
 #endif
-asmlinkage long sys_newfstat(unsigned int fd, struct stat __user *statbuf)
+SYSCALL_DEFINE2(newfstat, unsigned int, fd, struct stat __user *, statbuf)
 {
        struct kstat stat;
        int error = vfs_fstat(fd, &stat);
@@ -291,8 +293,8 @@ asmlinkage long sys_newfstat(unsigned int fd, struct stat __user *statbuf)
        return error;
 }
-asmlinkage long sys_readlinkat(int dfd, const char __user *pathname,
+SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname,
-                                char __user *buf, int bufsiz)
+                char __user *, buf, int, bufsiz)
 {
        struct path path;
        int error;
@@ -305,7 +307,7 @@ asmlinkage long sys_readlinkat(int dfd, const char __user *pathname,
                struct inode *inode = path.dentry->d_inode;
                error = -EINVAL;
-                if (inode->i_op && inode->i_op->readlink) {
+                if (inode->i_op->readlink) {
                        error = security_inode_readlink(path.dentry);
                        if (!error) {
                                touch_atime(path.mnt, path.dentry);
@@ -318,8 +320,8 @@ asmlinkage long sys_readlinkat(int dfd, const char __user *pathname,
        return error;
 }
-asmlinkage long sys_readlink(const char __user *path, char __user *buf,
+SYSCALL_DEFINE3(readlink, const char __user *, path, char __user *, buf,
-                                int bufsiz)
+                int, bufsiz)
 {
        return sys_readlinkat(AT_FDCWD, path, buf, bufsiz);
 }
@@ -365,7 +367,7 @@ static long cp_new_stat64(struct kstat *stat, struct stat64 __user *statbuf)
        return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0;
 }
-asmlinkage long sys_stat64(char __user * filename, struct stat64 __user * statbuf)
+SYSCALL_DEFINE2(stat64, char __user *, filename, struct stat64 __user *, statbuf)
 {
        struct kstat stat;
        int error = vfs_stat(filename, &stat);
@@ -375,7 +377,8 @@ asmlinkage long sys_stat64(char __user * filename, struct stat64 __user * statbu
        return error;
 }
-asmlinkage long sys_lstat64(char __user * filename, struct stat64 __user * statbuf)
+SYSCALL_DEFINE2(lstat64, char __user *, filename, struct stat64 __user *, statbuf)
 {
        struct kstat stat;
        int error = vfs_lstat(filename, &stat);
@@ -385,7 +388,8 @@ asmlinkage long sys_lstat64(char __user * filename, struct stat64 __user * statb
        return error;
 }
-asmlinkage long sys_fstat64(unsigned long fd, struct stat64 __user * statbuf)
+SYSCALL_DEFINE2(fstat64, unsigned long, fd, struct stat64 __user *, statbuf)
 {
        struct kstat stat;
        int error = vfs_fstat(fd, &stat);
@@ -396,8 +400,8 @@ asmlinkage long sys_fstat64(unsigned long fd, struct stat64 __user * statbuf)
        return error;
 }
-asmlinkage long sys_fstatat64(int dfd, char __user *filename,
+SYSCALL_DEFINE4(fstatat64, int, dfd, char __user *, filename,
-                               struct stat64 __user *statbuf, int flag)
+                struct stat64 __user *, statbuf, int, flag)
 {
        struct kstat stat;
        int error = -EINVAL;
diff --git a/fs/super.c b/fs/super.c
index ddba069d7a99..645e5403f2a0 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -38,6 +38,7 @@
 #include <linux/kobject.h>
 #include <linux/mutex.h>
 #include <linux/file.h>
+#include <linux/async.h>
 #include <asm/uaccess.h>
 #include "internal.h"
@@ -71,6 +72,7 @@ static struct super_block *alloc_super(struct file_system_type *type)
                INIT_HLIST_HEAD(&s->s_anon);
                INIT_LIST_HEAD(&s->s_inodes);
                INIT_LIST_HEAD(&s->s_dentry_lru);
+                INIT_LIST_HEAD(&s->s_async_list);
                init_rwsem(&s->s_umount);
                mutex_init(&s->s_lock);
                lockdep_set_class(&s->s_umount, &type->s_umount_key);
@@ -289,11 +291,18 @@ void generic_shutdown_super(struct super_block *sb)
 {
        const struct super_operations *sop = sb->s_op;
        if (sb->s_root) {
                shrink_dcache_for_umount(sb);
                fsync_super(sb);
                lock_super(sb);
                sb->s_flags &= ~MS_ACTIVE;
+                /*
+                 * wait for asynchronous fs operations to finish before going further
+                 */
+                async_synchronize_full_special(&sb->s_async_list);
                /* bad name - it should be evict_inodes() */
                invalidate_inodes(sb);
                lock_kernel();
@@ -461,6 +470,7 @@ restart:
                sb->s_count++;
                spin_unlock(&sb_lock);
                down_read(&sb->s_umount);
+                async_synchronize_full_special(&sb->s_async_list);
                if (sb->s_root && (wait || sb->s_dirt))
                        sb->s_op->sync_fs(sb, wait);
                up_read(&sb->s_umount);
@@ -534,7 +544,7 @@ rescan:
        return NULL;
 }
-asmlinkage long sys_ustat(unsigned dev, struct ustat __user * ubuf)
+SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf)
 {
        struct super_block *s;
        struct ustat tmp;
@@ -800,6 +810,7 @@ int get_sb_bdev(struct file_system_type *fs_type,
                }
                s->s_flags |= MS_ACTIVE;
+                bdev->bd_super = s;
        }
        return simple_set_mnt(mnt, s);
@@ -819,6 +830,7 @@ void kill_block_super(struct super_block *sb)
        struct block_device *bdev = sb->s_bdev;
        fmode_t mode = sb->s_mode;
+        bdev->bd_super = 0;
        generic_shutdown_super(sb);
        sync_blockdev(bdev);
        close_bdev_exclusive(bdev, mode);
diff --git a/fs/sync.c b/fs/sync.c
index 2967562d416f..a16d53e5fe9d 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -36,7 +36,7 @@ static void do_sync(unsigned long wait)
                laptop_sync_completion();
 }
-asmlinkage long sys_sync(void)
+SYSCALL_DEFINE0(sync)
 {
        do_sync(1);
        return 0;
@@ -75,14 +75,39 @@ int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
        return ret;
 }
-long do_fsync(struct file *file, int datasync)
+/**
+ * vfs_fsync - perform a fsync or fdatasync on a file
+ * @file:               file to sync
+ * @dentry:             dentry of @file
+ * @datasync:           only perform a fdatasync operation
+ *
+ * Write back data and metadata for @file to disk.  If @datasync is
+ * set only metadata needed to access modified file data is written.
+ *
+ * In case this function is called from nfsd @file may be %NULL and
+ * only @dentry is set.  This can only happen when the filesystem
+ * implements the export_operations API.
+ */
+int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
 {
-        int ret;
+        const struct file_operations *fop;
-        int err;
+        struct address_space *mapping;
-        struct address_space *mapping = file->f_mapping;
+        int err, ret;
-        if (!file->f_op || !file->f_op->fsync) {
+        /*
-                /* Why?  We can still call filemap_fdatawrite */
+         * Get mapping and operations from the file in case we have
+         * a file, or get the default values for them in case we
+         * don't have a struct file available.  Damn nfsd..
+         */
+        if (file) {
+                mapping = file->f_mapping;
+                fop = file->f_op;
+        } else {
+                mapping = dentry->d_inode->i_mapping;
+                fop = dentry->d_inode->i_fop;
+        }
+        if (!fop || !fop->fsync) {
                ret = -EINVAL;
                goto out;
        }
@@ -94,7 +119,7 @@ long do_fsync(struct file *file, int datasync)
         * livelocks in fsync_buffers_list().
         */
        mutex_lock(&mapping->host->i_mutex);
-        err = file->f_op->fsync(file, file->f_path.dentry, datasync);
+        err = fop->fsync(file, dentry, datasync);
        if (!ret)
                ret = err;
        mutex_unlock(&mapping->host->i_mutex);
@@ -104,28 +129,29 @@ long do_fsync(struct file *file, int datasync)
 out:
        return ret;
 }
+EXPORT_SYMBOL(vfs_fsync);
-static long __do_fsync(unsigned int fd, int datasync)
+static int do_fsync(unsigned int fd, int datasync)
 {
        struct file *file;
        int ret = -EBADF;
        file = fget(fd);
        if (file) {
-                ret = do_fsync(file, datasync);
+                ret = vfs_fsync(file, file->f_path.dentry, datasync);
                fput(file);
        }
        return ret;
 }
-asmlinkage long sys_fsync(unsigned int fd)
+SYSCALL_DEFINE1(fsync, unsigned int, fd)
 {
-        return __do_fsync(fd, 0);
+        return do_fsync(fd, 0);
 }
-asmlinkage long sys_fdatasync(unsigned int fd)
+SYSCALL_DEFINE1(fdatasync, unsigned int, fd)
 {
-        return __do_fsync(fd, 1);
+        return do_fsync(fd, 1);
 }
 /*
@@ -175,8 +201,8 @@ asmlinkage long sys_fdatasync(unsigned int fd)
 * already-instantiated disk blocks, there are no guarantees here that the data
 * will be available after a crash.
 */
-asmlinkage long sys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
+SYSCALL_DEFINE(sync_file_range)(int fd, loff_t offset, loff_t nbytes,
-                                        unsigned int flags)
+                                unsigned int flags)
 {
        int ret;
        struct file *file;
@@ -236,14 +262,32 @@ out_put:
 out:
        return ret;
 }
+#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
+asmlinkage long SyS_sync_file_range(long fd, loff_t offset, loff_t nbytes,
+                                    long flags)
+{
+        /* Narrow the long-typed arguments to the types SYSC_ is declared with */
+        int real_fd = (int) fd;
+        unsigned int real_flags = (unsigned int) flags;
+        return SYSC_sync_file_range(real_fd, offset, nbytes, real_flags);
+}
+SYSCALL_ALIAS(sys_sync_file_range, SyS_sync_file_range);
+#endif
 /* It would be nice if people remember that not all the world's an i386
   when they introduce new system calls */
-asmlinkage long sys_sync_file_range2(int fd, unsigned int flags,
+SYSCALL_DEFINE(sync_file_range2)(int fd, unsigned int flags,
-                                     loff_t offset, loff_t nbytes)
+                                 loff_t offset, loff_t nbytes)
 {
        return sys_sync_file_range(fd, offset, nbytes, flags);
 }
+#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
+asmlinkage long SyS_sync_file_range2(long fd, long flags,
+                                     loff_t offset, loff_t nbytes)
+{
+        /* Narrow the long-typed arguments to the types SYSC_ is declared with */
+        int real_fd = (int) fd;
+        unsigned int real_flags = (unsigned int) flags;
+        return SYSC_sync_file_range2(real_fd, real_flags, offset, nbytes);
+}
+SYSCALL_ALIAS(sys_sync_file_range2, SyS_sync_file_range2);
+#endif
 /*
 * `endbyte' is inclusive
@@ -269,7 +313,7 @@ int do_sync_mapping_range(struct address_space *mapping, loff_t offset,
        if (flags & SYNC_FILE_RANGE_WRITE) {
                ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
-                                                WB_SYNC_NONE);
+                                                WB_SYNC_ALL);
                if (ret < 0)
                        goto out;
        }
diff --git a/fs/sysfs/Kconfig b/fs/sysfs/Kconfig
new file mode 100644
index 000000000000..f4b67588b9d6
--- /dev/null
+++ b/fs/sysfs/Kconfig
@@ -0,0 +1,23 @@
+config SYSFS
+        bool "sysfs file system support" if EMBEDDED
+        default y
+        help
+        The sysfs filesystem is a virtual filesystem that the kernel uses to
+        export internal kernel objects, their attributes, and their
+        relationships to one another.
+        Users can use sysfs to ascertain useful information about the running
+        kernel, such as the devices the kernel has discovered on each bus and
+        which driver each is bound to. sysfs can also be used to tune devices
+        and other kernel subsystems.
+        Some system agents rely on the information in sysfs to operate.
+        /sbin/hotplug uses device and object attributes in sysfs to assist in
+        delegating policy decisions, like persistently naming devices.
+        sysfs is currently used by the block subsystem to mount the root
+        partition.  If sysfs is disabled you must specify the boot device on
+        the kernel boot command line via its major and minor numbers.  For
+        example, "root=03:01" for /dev/hda1.
+        Designers of embedded systems may wish to say N here to conserve space.
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 66f6e58a7e4b..f2c478c3424e 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -63,6 +63,9 @@ read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
        int count = min_t(size_t, bytes, PAGE_SIZE);
        char *temp;
+        if (!bytes)
+                return 0;
        if (size) {
                if (offs > size)
                        return 0;
@@ -131,6 +134,9 @@ static ssize_t write(struct file *file, const char __user *userbuf,
        int count = min_t(size_t, bytes, PAGE_SIZE);
        char *temp;
+        if (!bytes)
+                return 0;
        if (size) {
                if (offs > size)
                        return 0;
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index eb53c632f856..dfa3d94cfc74 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -107,8 +107,6 @@ int sysfs_setattr(struct dentry * dentry, struct iattr * iattr)
 static inline void set_default_inode_attr(struct inode * inode, mode_t mode)
 {
        inode->i_mode = mode;
-        inode->i_uid = 0;
-        inode->i_gid = 0;
        inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 }
@@ -149,7 +147,6 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
 {
        struct bin_attribute *bin_attr;
-        inode->i_blocks = 0;
        inode->i_mapping->a_ops = &sysfs_aops;
        inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info;
        inode->i_op = &sysfs_inode_operations;
diff --git a/fs/sysv/Kconfig b/fs/sysv/Kconfig
new file mode 100644
index 000000000000..33aeb4b75db1
--- /dev/null
+++ b/fs/sysv/Kconfig
@@ -0,0 +1,36 @@
+config SYSV_FS
+        tristate "System V/Xenix/V7/Coherent file system support"
+        depends on BLOCK
+        help
+          SCO, Xenix and Coherent are commercial Unix systems for Intel
+          machines, and Version 7 was used on the DEC PDP-11. Saying Y
+          here would allow you to read from their floppies and hard disk
+          partitions.
+          If you have floppies or hard disk partitions like that, it is likely
+          that they contain binaries from those other Unix systems; in order
+          to run these binaries, you will want to install linux-abi which is
+          a set of kernel modules that lets you run SCO, Xenix, Wyse,
+          UnixWare, Dell Unix and System V programs under Linux.  It is
+          available via FTP (user: ftp) from
+          <ftp://ftp.openlinux.org/pub/people/hch/linux-abi/>).
+          NOTE: that will work only for binaries from Intel-based systems;
+          PDP ones will have to wait until somebody ports Linux to -11 ;-)
+          If you only intend to mount files from some other Unix over the
+          network using NFS, you don't need the System V file system support
+          (but you need NFS file system support obviously).
+          Note that this option is generally not needed for floppies, since a
+          good portable way to transport files and directories between unixes
+          (and even other operating systems) is given by the tar program ("man
+          tar" or preferably "info tar").  Note also that this option has
+          nothing whatsoever to do with the option "System V IPC". Read about
+          the System V file system in
+          <file:Documentation/filesystems/sysv-fs.txt>.
+          Saying Y here will enlarge your kernel by about 27 KB.
+          To compile this as a module, choose M here: the module will be called
+          sysv.
+          If you haven't heard about all of this before, it's safe to say N.
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index df0d435baa48..3d81bf58dae2 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -27,6 +27,7 @@
 #include <linux/init.h>
 #include <linux/buffer_head.h>
 #include <linux/vfs.h>
+#include <linux/namei.h>
 #include <asm/byteorder.h>
 #include "sysv.h"
@@ -163,8 +164,11 @@ void sysv_set_inode(struct inode *inode, dev_t rdev)
                if (inode->i_blocks) {
                        inode->i_op = &sysv_symlink_inode_operations;
                        inode->i_mapping->a_ops = &sysv_aops;
-                } else
+                } else {
                        inode->i_op = &sysv_fast_symlink_inode_operations;
+                        nd_terminate_link(SYSV_I(inode)->i_data, inode->i_size,
+                                sizeof(SYSV_I(inode)->i_data) - 1);
+                }
        } else
                init_special_inode(inode, inode->i_mode, rdev);
 }
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 0862f0e49d0c..6a123b8ff3f5 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -177,7 +177,7 @@ static struct file *timerfd_fget(int fd)
        return file;
 }
-asmlinkage long sys_timerfd_create(int clockid, int flags)
+SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
 {
        int ufd;
        struct timerfd_ctx *ctx;
@@ -208,9 +208,9 @@ asmlinkage long sys_timerfd_create(int clockid, int flags)
        return ufd;
 }
-asmlinkage long sys_timerfd_settime(int ufd, int flags,
+SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
-                                    const struct itimerspec __user *utmr,
+                const struct itimerspec __user *, utmr,
-                                    struct itimerspec __user *otmr)
+                struct itimerspec __user *, otmr)
 {
        struct file *file;
        struct timerfd_ctx *ctx;
@@ -265,7 +265,7 @@ asmlinkage long sys_timerfd_settime(int ufd, int flags,
        return 0;
 }
-asmlinkage long sys_timerfd_gettime(int ufd, struct itimerspec __user *otmr)
+SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr)
 {
        struct file *file;
        struct timerfd_ctx *ctx;
diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig
index 91ceeda7e5bf..e35b54d5059d 100644
--- a/fs/ubifs/Kconfig
+++ b/fs/ubifs/Kconfig
@@ -40,7 +40,7 @@ config UBIFS_FS_ZLIB
        depends on UBIFS_FS
        default y
        help
-          Zlib copresses better then LZO but it is slower. Say 'Y' if unsure.
+          Zlib compresses better than LZO but it is slower. Say 'Y' if unsure.
 # Debugging-related stuff
 config UBIFS_FS_DEBUG
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 4a18f084cc42..f393620890ee 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -32,18 +32,15 @@
 #include "ubifs.h"
 #include <linux/writeback.h>
-#include <asm/div64.h>
+#include <linux/math64.h>
 /*
 * When pessimistic budget calculations say that there is no enough space,
 * UBIFS starts writing back dirty inodes and pages, doing garbage collection,
- * or committing. The below constants define maximum number of times UBIFS
+ * or committing. The below constant defines maximum number of times UBIFS
 * repeats the operations.
 */
-#define MAX_SHRINK_RETRIES 8
+#define MAX_MKSPC_RETRIES 3
-#define MAX_GC_RETRIES     4
-#define MAX_CMT_RETRIES    2
-#define MAX_NOSPC_RETRIES  1
 /*
 * The below constant defines amount of dirty pages which should be written
@@ -52,30 +49,6 @@
 #define NR_TO_WRITE 16
 /**
- * struct retries_info - information about re-tries while making free space.
- * @prev_liability: previous liability
- * @shrink_cnt: how many times the liability was shrinked
- * @shrink_retries: count of liability shrink re-tries (increased when
- *                  liability does not shrink)
- * @try_gc: GC should be tried first
- * @gc_retries: how many times GC was run
- * @cmt_retries: how many times commit has been done
- * @nospc_retries: how many times GC returned %-ENOSPC
- *
- * Since we consider budgeting to be the fast-path, and this structure has to
- * be allocated on stack and zeroed out, we make it smaller using bit-fields.
- */
-struct retries_info {
-        long long prev_liability;
-        unsigned int shrink_cnt;
-        unsigned int shrink_retries:5;
-        unsigned int try_gc:1;
-        unsigned int gc_retries:4;
-        unsigned int cmt_retries:3;
-        unsigned int nospc_retries:1;
-};
-/**
 * shrink_liability - write-back some dirty pages/inodes.
 * @c: UBIFS file-system description object
 * @nr_to_write: how many dirty pages to write-back
@@ -147,13 +120,29 @@ static int run_gc(struct ubifs_info *c)
 }
 /**
+ * get_liability - calculate current liability.
+ * @c: UBIFS file-system description object
+ *
+ * This function calculates and returns current UBIFS liability, i.e. the
+ * amount of bytes UBIFS has "promised" to write to the media. The liability is
+ * the sum of @c->budg_idx_growth, @c->budg_data_growth and @c->budg_dd_growth,
+ * which are read together under @c->space_lock.
+ */
+static long long get_liability(struct ubifs_info *c)
+{
+        long long liab;
+        spin_lock(&c->space_lock);
+        liab = c->budg_idx_growth + c->budg_data_growth + c->budg_dd_growth;
+        spin_unlock(&c->space_lock);
+        return liab;
+}
+/**
 * make_free_space - make more free space on the file-system.
 * @c: UBIFS file-system description object
- * @ri: information about previous invocations of this function
 *
 * This function is called when an operation cannot be budgeted because there
 * is supposedly no free space. But in most cases there is some free space:
- *   o budgeting is pessimistic, so it always budgets more then it is actually
+ *   o budgeting is pessimistic, so it always budgets more than it is actually
 *     needed, so shrinking the liability is one way to make free space - the
 *     cached data will take less space then it was budgeted for;
 *   o GC may turn some dark space into free space (budgeting treats dark space
@@ -165,87 +154,42 @@ static int run_gc(struct ubifs_info *c)
 * Returns %-ENOSPC if it couldn't do more free space, and other negative error
 * codes on failures.
 */
-static int make_free_space(struct ubifs_info *c, struct retries_info *ri)
+static int make_free_space(struct ubifs_info *c)
 {
-        int err;
+        int err, retries = 0;
+        long long liab1, liab2;
-        /*
+        do {
-         * If we have some dirty pages and inodes (liability), try to write
+                liab1 = get_liability(c);
-         * them back unless this was tried too many times without effect
+                /*
-         * already.
+                 * We probably have some dirty pages or inodes (liability), try
-         */
+                 * to write them back.
-        if (ri->shrink_retries < MAX_SHRINK_RETRIES && !ri->try_gc) {
+                 */
-                long long liability;
+                dbg_budg("liability %lld, run write-back", liab1);
+                shrink_liability(c, NR_TO_WRITE);
-                spin_lock(&c->space_lock);
-                liability = c->budg_idx_growth + c->budg_data_growth +
-                            c->budg_dd_growth;
-                spin_unlock(&c->space_lock);
-                if (ri->prev_liability >= liability) {
-                        /* Liability does not shrink, next time try GC then */
-                        ri->shrink_retries += 1;
-                        if (ri->gc_retries < MAX_GC_RETRIES)
-                                ri->try_gc = 1;
-                        dbg_budg("liability did not shrink: retries %d of %d",
-                                 ri->shrink_retries, MAX_SHRINK_RETRIES);
-                }
-                dbg_budg("force write-back (count %d)", ri->shrink_cnt);
-                shrink_liability(c, NR_TO_WRITE + ri->shrink_cnt);
-                ri->prev_liability = liability;
+                liab2 = get_liability(c);
-                ri->shrink_cnt += 1;
+                if (liab2 < liab1)
-                return -EAGAIN;
+                        return -EAGAIN;
-        }
-        /*
+                dbg_budg("new liability %lld (not shrinked)", liab2);
-         * Try to run garbage collector unless it was already tried too many
-         * times.
-         */
-        if (ri->gc_retries < MAX_GC_RETRIES) {
-                ri->gc_retries += 1;
-                dbg_budg("run GC, retries %d of %d",
-                         ri->gc_retries, MAX_GC_RETRIES);
-                ri->try_gc = 0;
+                /* Liability did not shrink again, try GC */
+                dbg_budg("Run GC");
                err = run_gc(c);
                if (!err)
                        return -EAGAIN;
-                if (err == -EAGAIN) {
+                if (err != -EAGAIN && err != -ENOSPC)
-                        dbg_budg("GC asked to commit");
+                        /* Some real error happened */
-                        err = ubifs_run_commit(c);
-                        if (err)
-                                return err;
-                        return -EAGAIN;
-                }
-                if (err != -ENOSPC)
-                        return err;
-                /*
-                 * GC could not make any progress. If this is the first time,
-                 * then it makes sense to try to commit, because it might make
-                 * some dirty space.
-                 */
-                dbg_budg("GC returned -ENOSPC, retries %d",
-                         ri->nospc_retries);
-                if (ri->nospc_retries >= MAX_NOSPC_RETRIES)
                        return err;
-                ri->nospc_retries += 1;
-        }
-        /* Neither GC nor write-back helped, try to commit */
+                dbg_budg("Run commit (retries %d)", retries);
-        if (ri->cmt_retries < MAX_CMT_RETRIES) {
-                ri->cmt_retries += 1;
-                dbg_budg("run commit, retries %d of %d",
-                         ri->cmt_retries, MAX_CMT_RETRIES);
                err = ubifs_run_commit(c);
                if (err)
                        return err;
-                return -EAGAIN;
+        } while (retries++ < MAX_MKSPC_RETRIES);
-        }
        return -ENOSPC;
 }
@@ -258,8 +202,8 @@ static int make_free_space(struct ubifs_info *c, struct retries_info *ri)
 */
 int ubifs_calc_min_idx_lebs(struct ubifs_info *c)
 {
-        int ret;
+        int idx_lebs, eff_leb_size = c->leb_size - c->max_idx_node_sz;
-        uint64_t idx_size;
+        long long idx_size;
        idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx;
@@ -271,23 +215,16 @@ int ubifs_calc_min_idx_lebs(struct ubifs_info *c)
         * pair, nor similarly the two variables for the new index size, so we
         * have to do this costly 64-bit division on fast-path.
         */
-        if (do_div(idx_size, c->leb_size - c->max_idx_node_sz))
+        idx_size += eff_leb_size - 1;
-                ret = idx_size + 1;
+        idx_lebs = div_u64(idx_size, eff_leb_size);
-        else
-                ret = idx_size;
        /*
         * The index head is not available for the in-the-gaps method, so add an
         * extra LEB to compensate.
         */
-        ret += 1;
+        idx_lebs += 1;
-        /*
+        if (idx_lebs < MIN_INDEX_LEBS)
-         * At present the index needs at least 2 LEBs: one for the index head
+                idx_lebs = MIN_INDEX_LEBS;
-         * and one for in-the-gaps method (which currently does not cater for
+        return idx_lebs;
-         * the index head and so excludes it from consideration).
-         */
-        if (ret < 2)
-                ret = 2;
-        return ret;
 }
 /**
@@ -530,8 +467,7 @@ static int calc_dd_growth(const struct ubifs_info *c,
 int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req)
 {
        int uninitialized_var(cmt_retries), uninitialized_var(wb_retries);
-        int err, idx_growth, data_growth, dd_growth;
+        int err, idx_growth, data_growth, dd_growth, retried = 0;
-        struct retries_info ri;
        ubifs_assert(req->new_page <= 1);
        ubifs_assert(req->dirtied_page <= 1);
@@ -549,7 +485,6 @@ int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req)
        if (!data_growth && !dd_growth)
                return 0;
        idx_growth = calc_idx_growth(c, req);
-        memset(&ri, 0, sizeof(struct retries_info));
 again:
        spin_lock(&c->space_lock);
@@ -587,12 +522,17 @@ again:
                return err;
        }
-        err = make_free_space(c, &ri);
+        err = make_free_space(c);
+        cond_resched();
        if (err == -EAGAIN) {
                dbg_budg("try again");
-                cond_resched();
                goto again;
        } else if (err == -ENOSPC) {
+                if (!retried) {
+                        retried = 1;
+                        dbg_budg("-ENOSPC, but anyway try once again");
+                        goto again;
+                }
                dbg_budg("FS is full, -ENOSPC");
                c->nospace = 1;
                if (can_use_rp(c) || c->rp_size == 0)
@@ -666,7 +606,7 @@ void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req)
 * @c: UBIFS file-system description object
 *
 * This function converts budget which was allocated for a new page of data to
- * the budget of changing an existing page of data. The latter is smaller then
+ * the budget of changing an existing page of data. The latter is smaller than
 * the former, so this function only does simple re-calculation and does not
 * involve any write-back.
 */
@@ -712,9 +652,9 @@ void ubifs_release_dirty_inode_budget(struct ubifs_info *c,
 * user-space. User-space application tend to expect that if the file-system
 * (e.g., via the 'statfs()' call) reports that it has N bytes available, they
 * are able to write a file of size N. UBIFS attaches node headers to each data
- * node and it has to write indexind nodes as well. This introduces additional
+ * node and it has to write indexing nodes as well. This introduces additional
- * overhead, and UBIFS it has to report sligtly less free space to meet the
+ * overhead, and UBIFS has to report slightly less free space to meet the above
- * above expectetion.
+ * expectations.
 *
 * This function assumes free space is made up of uncompressed data nodes and
 * full index nodes (one per data node, tripled because we always allow enough
@@ -723,7 +663,7 @@ void ubifs_release_dirty_inode_budget(struct ubifs_info *c,
 * Note, the calculation is pessimistic, which means that most of the time
 * UBIFS reports less space than it actually has.
 */
-long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free)
+long long ubifs_reported_space(const struct ubifs_info *c, long long free)
 {
        int divisor, factor, f;
@@ -737,7 +677,7 @@ long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free)
         * of data nodes, f - fanout. Because effective UBIFS fanout is twice
         * as less than maximum fanout, we assume that each data node
         * introduces 3 * @c->max_idx_node_sz / (@c->fanout/2 - 1) bytes.
-         * Note, the multiplier 3 is because UBIFS reseves thrice as more space
+         * Note, the multiplier 3 is because UBIFS reserves thrice as more space
         * for the index.
         */
        f = c->fanout > 3 ? c->fanout >> 1 : 2;
@@ -745,45 +685,33 @@ long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free)
        divisor = UBIFS_MAX_DATA_NODE_SZ;
        divisor += (c->max_idx_node_sz * 3) / (f - 1);
        free *= factor;
-        do_div(free, divisor);
+        return div_u64(free, divisor);
-        return free;
 }
 /**
- * ubifs_get_free_space - return amount of free space.
+ * ubifs_get_free_space_nolock - return amount of free space.
 * @c: UBIFS file-system description object
 *
 * This function calculates amount of free space to report to user-space.
 *
 * Because UBIFS may introduce substantial overhead (the index, node headers,
- * alighment, wastage at the end of eraseblocks, etc), it cannot report real
+ * alignment, wastage at the end of eraseblocks, etc), it cannot report real
 * amount of free flash space it has (well, because not all dirty space is
- * reclamable, UBIFS does not actually know the real amount). If UBIFS did so,
+ * reclaimable, UBIFS does not actually know the real amount). If UBIFS did so,
- * it would bread user expectetion about what free space is. Users seem to
+ * it would break user expectations about what free space is. Users seem to
 * accustomed to assume that if the file-system reports N bytes of free space,
 * they would be able to fit a file of N bytes to the FS. This almost works for
 * traditional file-systems, because they have way less overhead than UBIFS.
 * So, to keep users happy, UBIFS tries to take the overhead into account.
 */
-long long ubifs_get_free_space(struct ubifs_info *c)
+long long ubifs_get_free_space_nolock(struct ubifs_info *c)
 {
-        int min_idx_lebs, rsvd_idx_lebs, lebs;
+        int rsvd_idx_lebs, lebs;
        long long available, outstanding, free;
-        spin_lock(&c->space_lock);
+        ubifs_assert(c->min_idx_lebs == ubifs_calc_min_idx_lebs(c));
-        min_idx_lebs = ubifs_calc_min_idx_lebs(c);
        outstanding = c->budg_data_growth + c->budg_dd_growth;
+        available = ubifs_calc_available(c, c->min_idx_lebs);
-        /*
-         * Force the amount available to the total size reported if the used
-         * space is zero.
-         */
-        if (c->lst.total_used <= UBIFS_INO_NODE_SZ && !outstanding) {
-                spin_unlock(&c->space_lock);
-                return (long long)c->block_cnt << UBIFS_BLOCK_SHIFT;
-        }
-        available = ubifs_calc_available(c, min_idx_lebs);
        /*
         * When reporting free space to user-space, UBIFS guarantees that it is
@@ -796,15 +724,14 @@ long long ubifs_get_free_space(struct ubifs_info *c)
         * Note, the calculations below are similar to what we have in
         * 'do_budget_space()', so refer there for comments.
         */
-        if (min_idx_lebs > c->lst.idx_lebs)
+        if (c->min_idx_lebs > c->lst.idx_lebs)
-                rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs;
+                rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs;
        else
                rsvd_idx_lebs = 0;
        lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
               c->lst.taken_empty_lebs;
        lebs -= rsvd_idx_lebs;
        available += lebs * (c->dark_wm - c->leb_overhead);
-        spin_unlock(&c->space_lock);
        if (available > outstanding)
                free = ubifs_reported_space(c, available - outstanding);
@@ -812,3 +739,21 @@ long long ubifs_get_free_space(struct ubifs_info *c)
                free = 0;
        return free;
 }
+/**
+ * ubifs_get_free_space - return amount of free space.
+ * @c: UBIFS file-system description object
+ *
+ * This function calculates and returns amount of free space to report to
+ * user-space. It is a wrapper which takes @c->space_lock around
+ * 'ubifs_get_free_space_nolock()'.
+ */
+long long ubifs_get_free_space(struct ubifs_info *c)
+{
+        long long free;
+        spin_lock(&c->space_lock);
+        free = ubifs_get_free_space_nolock(c);
+        spin_unlock(&c->space_lock);
+        return free;
+}
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index b49884c8c10e..f3a7945527fb 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -470,12 +470,12 @@ int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot)
 {
        struct ubifs_idx_node *idx;
        int lnum, offs, len, err = 0;
+        struct ubifs_debug_info *d = c->dbg;
-        c->old_zroot = *zroot;
+        d->old_zroot = *zroot;
+        lnum = d->old_zroot.lnum;
-        lnum = c->old_zroot.lnum;
+        offs = d->old_zroot.offs;
-        offs = c->old_zroot.offs;
+        len = d->old_zroot.len;
-        len = c->old_zroot.len;
        idx = kmalloc(c->max_idx_node_sz, GFP_NOFS);
        if (!idx)
@@ -485,8 +485,8 @@ int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot)
        if (err)
                goto out;
-        c->old_zroot_level = le16_to_cpu(idx->level);
+        d->old_zroot_level = le16_to_cpu(idx->level);
-        c->old_zroot_sqnum = le64_to_cpu(idx->ch.sqnum);
+        d->old_zroot_sqnum = le64_to_cpu(idx->ch.sqnum);
 out:
        kfree(idx);
        return err;
@@ -509,6 +509,7 @@ int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot)
 {
        int lnum, offs, len, err = 0, uninitialized_var(last_level), child_cnt;
        int first = 1, iip;
+        struct ubifs_debug_info *d = c->dbg;
        union ubifs_key lower_key, upper_key, l_key, u_key;
        unsigned long long uninitialized_var(last_sqnum);
        struct ubifs_idx_node *idx;
@@ -525,9 +526,9 @@ int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot)
             UBIFS_IDX_NODE_SZ;
        /* Start at the old zroot */
-        lnum = c->old_zroot.lnum;
+        lnum = d->old_zroot.lnum;
-        offs = c->old_zroot.offs;
+        offs = d->old_zroot.offs;
-        len = c->old_zroot.len;
+        len = d->old_zroot.len;
        iip = 0;
        /*
@@ -560,11 +561,11 @@ int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot)
                if (first) {
                        first = 0;
                        /* Check root level and sqnum */
-                        if (le16_to_cpu(idx->level) != c->old_zroot_level) {
+                        if (le16_to_cpu(idx->level) != d->old_zroot_level) {
                                err = 2;
                                goto out_dump;
                        }
-                        if (le64_to_cpu(idx->ch.sqnum) != c->old_zroot_sqnum) {
+                        if (le64_to_cpu(idx->ch.sqnum) != d->old_zroot_sqnum) {
                                err = 3;
                                goto out_dump;
                        }
diff --git a/fs/ubifs/compress.c b/fs/ubifs/compress.c
index a0ada596b17c..11e4132f314a 100644
--- a/fs/ubifs/compress.c
+++ b/fs/ubifs/compress.c
@@ -33,7 +33,7 @@
 /* Fake description object for the "none" compressor */
 static struct ubifs_compressor none_compr = {
        .compr_type = UBIFS_COMPR_NONE,
-        .name = "no compression",
+        .name = "none",
        .capi_name = "",
 };
@@ -43,13 +43,13 @@ static DEFINE_MUTEX(lzo_mutex);
 static struct ubifs_compressor lzo_compr = {
        .compr_type = UBIFS_COMPR_LZO,
        .comp_mutex = &lzo_mutex,
-        .name = "LZO",
+        .name = "lzo",
        .capi_name = "lzo",
 };
 #else
 static struct ubifs_compressor lzo_compr = {
        .compr_type = UBIFS_COMPR_LZO,
-        .name = "LZO",
+        .name = "lzo",
 };
 #endif
@@ -108,7 +108,7 @@ void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len,
        if (compr->comp_mutex)
                mutex_lock(compr->comp_mutex);
        err = crypto_comp_compress(compr->cc, in_buf, in_len, out_buf,
-                                   out_len);
+                                   (unsigned int *)out_len);
        if (compr->comp_mutex)
                mutex_unlock(compr->comp_mutex);
        if (unlikely(err)) {
@@ -119,10 +119,10 @@ void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len,
        }
        /*
-         * Presently, we just require that compression results in less data,
+         * If the data compressed only slightly, it is better to leave it
-         * rather than any defined minimum compression ratio or amount.
+         * uncompressed to improve read speed.
         */
-        if (ALIGN(*out_len, 8) >= ALIGN(in_len, 8))
+        if (in_len - *out_len < UBIFS_MIN_COMPRESS_DIFF)
                goto no_compr;
        return;
@@ -172,7 +172,7 @@ int ubifs_decompress(const void *in_buf, int in_len, void *out_buf,
        if (compr->decomp_mutex)
                mutex_lock(compr->decomp_mutex);
        err = crypto_comp_decompress(compr->cc, in_buf, in_len, out_buf,
-                                     out_len);
+                                     (unsigned int *)out_len);
        if (compr->decomp_mutex)
                mutex_unlock(compr->decomp_mutex);
        if (err)
@@ -244,7 +244,7 @@ out_lzo:
 /**
 * ubifs_compressors_exit - de-initialize UBIFS compressors.
 */
-void __exit ubifs_compressors_exit(void)
+void ubifs_compressors_exit(void)
 {
        compr_exit(&lzo_compr);
        compr_exit(&zlib_compr);
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 510ffa0bbda4..e975bd82f38b 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -32,6 +32,8 @@
 #include "ubifs.h"
 #include <linux/module.h>
 #include <linux/moduleparam.h>
+#include <linux/debugfs.h>
+#include <linux/math64.h>
 #ifdef CONFIG_UBIFS_FS_DEBUG
@@ -596,7 +598,9 @@ void dbg_dump_budg(struct ubifs_info *c)
        struct rb_node *rb;
        struct ubifs_bud *bud;
        struct ubifs_gced_idx_leb *idx_gc;
+        long long available, outstanding, free;
+        ubifs_assert(spin_is_locked(&c->space_lock));
        spin_lock(&dbg_lock);
        printk(KERN_DEBUG "(pid %d) Budgeting info: budg_data_growth %lld, "
               "budg_dd_growth %lld, budg_idx_growth %lld\n", current->pid,
@@ -616,9 +620,11 @@ void dbg_dump_budg(struct ubifs_info *c)
               c->dark_wm, c->dead_wm, c->max_idx_node_sz);
        printk(KERN_DEBUG "\tgc_lnum %d, ihead_lnum %d\n",
               c->gc_lnum, c->ihead_lnum);
-        for (i = 0; i < c->jhead_cnt; i++)
+        /* If we are in R/O mode, journal heads do not exist */
-                printk(KERN_DEBUG "\tjhead %d\t LEB %d\n",
+        if (c->jheads)
-                       c->jheads[i].wbuf.jhead, c->jheads[i].wbuf.lnum);
+                for (i = 0; i < c->jhead_cnt; i++)
+                        printk(KERN_DEBUG "\tjhead %d\t LEB %d\n",
+                               c->jheads[i].wbuf.jhead, c->jheads[i].wbuf.lnum);
        for (rb = rb_first(&c->buds); rb; rb = rb_next(rb)) {
                bud = rb_entry(rb, struct ubifs_bud, rb);
                printk(KERN_DEBUG "\tbud LEB %d\n", bud->lnum);
@@ -629,6 +635,14 @@ void dbg_dump_budg(struct ubifs_info *c)
                printk(KERN_DEBUG "\tGC'ed idx LEB %d unmap %d\n",
                       idx_gc->lnum, idx_gc->unmap);
        printk(KERN_DEBUG "\tcommit state %d\n", c->cmt_state);
+        /* Print budgeting predictions */
+        available = ubifs_calc_available(c, c->min_idx_lebs);
+        outstanding = c->budg_data_growth + c->budg_dd_growth;
+        free = ubifs_get_free_space_nolock(c);
+        printk(KERN_DEBUG "Budgeting predictions:\n");
+        printk(KERN_DEBUG "\tavailable: %lld, outstanding %lld, free %lld\n",
+               available, outstanding, free);
        spin_unlock(&dbg_lock);
 }
@@ -645,7 +659,8 @@ void dbg_dump_lprops(struct ubifs_info *c)
        struct ubifs_lprops lp;
        struct ubifs_lp_stats lst;
-        printk(KERN_DEBUG "(pid %d) Dumping LEB properties\n", current->pid);
+        printk(KERN_DEBUG "(pid %d) start dumping LEB properties\n",
+               current->pid);
        ubifs_get_lp_stats(c, &lst);
        dbg_dump_lstats(&lst);
@@ -656,6 +671,8 @@ void dbg_dump_lprops(struct ubifs_info *c)
                dbg_dump_lprop(c, &lp);
        }
+        printk(KERN_DEBUG "(pid %d) finish dumping LEB properties\n",
+               current->pid);
 }
 void dbg_dump_lpt_info(struct ubifs_info *c)
@@ -663,6 +680,7 @@ void dbg_dump_lpt_info(struct ubifs_info *c)
        int i;
        spin_lock(&dbg_lock);
+        printk(KERN_DEBUG "(pid %d) dumping LPT information\n", current->pid);
        printk(KERN_DEBUG "\tlpt_sz:        %lld\n", c->lpt_sz);
        printk(KERN_DEBUG "\tpnode_sz:      %d\n", c->pnode_sz);
        printk(KERN_DEBUG "\tnnode_sz:      %d\n", c->nnode_sz);
@@ -684,7 +702,8 @@ void dbg_dump_lpt_info(struct ubifs_info *c)
        printk(KERN_DEBUG "\tLPT root is at %d:%d\n", c->lpt_lnum, c->lpt_offs);
        printk(KERN_DEBUG "\tLPT head is at %d:%d\n",
               c->nhead_lnum, c->nhead_offs);
-        printk(KERN_DEBUG "\tLPT ltab is at %d:%d\n", c->ltab_lnum, c->ltab_offs);
+        printk(KERN_DEBUG "\tLPT ltab is at %d:%d\n",
+               c->ltab_lnum, c->ltab_offs);
        if (c->big_lpt)
                printk(KERN_DEBUG "\tLPT lsave is at %d:%d\n",
                       c->lsave_lnum, c->lsave_offs);
@@ -703,9 +722,9 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum)
        if (dbg_failure_mode)
                return;
-        printk(KERN_DEBUG "(pid %d) Dumping LEB %d\n", current->pid, lnum);
+        printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n",
+               current->pid, lnum);
-        sleb = ubifs_scan(c, lnum, 0, c->dbg_buf);
+        sleb = ubifs_scan(c, lnum, 0, c->dbg->buf);
        if (IS_ERR(sleb)) {
                ubifs_err("scan error %d", (int)PTR_ERR(sleb));
                return;
@@ -721,6 +740,8 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum)
                dbg_dump_node(c, snod->node);
        }
+        printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n",
+               current->pid, lnum);
        ubifs_scan_destroy(sleb);
        return;
 }
@@ -768,7 +789,7 @@ void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat)
 {
        int i;
-        printk(KERN_DEBUG "(pid %d) Dumping heap cat %d (%d elements)\n",
+        printk(KERN_DEBUG "(pid %d) start dumping heap cat %d (%d elements)\n",
               current->pid, cat, heap->cnt);
        for (i = 0; i < heap->cnt; i++) {
                struct ubifs_lprops *lprops = heap->arr[i];
@@ -777,6 +798,7 @@ void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat)
                       "flags %d\n", i, lprops->lnum, lprops->hpos,
                       lprops->free, lprops->dirty, lprops->flags);
        }
+        printk(KERN_DEBUG "(pid %d) finish dumping heap\n", current->pid);
 }
 void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
@@ -784,7 +806,7 @@ void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
 {
        int i;
-        printk(KERN_DEBUG "(pid %d) Dumping pnode:\n", current->pid);
+        printk(KERN_DEBUG "(pid %d) dumping pnode:\n", current->pid);
        printk(KERN_DEBUG "\taddress %zx parent %zx cnext %zx\n",
               (size_t)pnode, (size_t)parent, (size_t)pnode->cnext);
        printk(KERN_DEBUG "\tflags %lu iip %d level %d num %d\n",
@@ -803,7 +825,7 @@ void dbg_dump_tnc(struct ubifs_info *c)
        int level;
        printk(KERN_DEBUG "\n");
-        printk(KERN_DEBUG "(pid %d) Dumping the TNC tree\n", current->pid);
+        printk(KERN_DEBUG "(pid %d) start dumping TNC tree\n", current->pid);
        znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL);
        level = znode->level;
        printk(KERN_DEBUG "== Level %d ==\n", level);
@@ -815,8 +837,7 @@ void dbg_dump_tnc(struct ubifs_info *c)
                dbg_dump_znode(c, znode);
                znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode);
        }
+        printk(KERN_DEBUG "(pid %d) finish dumping TNC tree\n", current->pid);
-        printk(KERN_DEBUG "\n");
 }
 static int dump_znode(struct ubifs_info *c, struct ubifs_znode *znode,
@@ -839,6 +860,65 @@ void dbg_dump_index(struct ubifs_info *c)
 }
 /**
+ * dbg_save_space_info - save information about flash space.
+ * @c: UBIFS file-system description object
+ *
+ * This function saves information about UBIFS free space, dirty space, etc, in
+ * order to check it later. The lprops statistics are stored in
+ * @c->dbg->saved_lst and the free space amount, calculated under
+ * @c->space_lock, is stored in @c->dbg->saved_free.
+ */
+void dbg_save_space_info(struct ubifs_info *c)
+{
+        struct ubifs_debug_info *d = c->dbg;
+        ubifs_get_lp_stats(c, &d->saved_lst);
+        spin_lock(&c->space_lock);
+        d->saved_free = ubifs_get_free_space_nolock(c);
+        spin_unlock(&c->space_lock);
+}
+/**
+ * dbg_check_space_info - check flash space information.
+ * @c: UBIFS file-system description object
+ *
+ * This function compares current flash space information with the information
+ * which was saved when the 'dbg_save_space_info()' function was called.
+ * Returns zero if the information has not changed, and %-EINVAL if it has
+ * changed. In the latter case the saved and the current lprops statistics, the
+ * budgeting information and a stack trace are dumped.
+ */
+int dbg_check_space_info(struct ubifs_info *c)
+{
+        struct ubifs_debug_info *d = c->dbg;
+        struct ubifs_lp_stats lst;
+        long long free;
+        free = ubifs_get_free_space(c);
+        if (free != d->saved_free) {
+                ubifs_err("free space changed from %lld to %lld",
+                          d->saved_free, free);
+                goto out;
+        }
+        return 0;
+out:
+        ubifs_msg("saved lprops statistics dump");
+        dbg_dump_lstats(&d->saved_lst);
+        ubifs_get_lp_stats(c, &lst);
+        ubifs_msg("current lprops statistics dump");
+        /* Dump the freshly read statistics, not the saved ones again */
+        dbg_dump_lstats(&lst);
+        /* 'dbg_dump_budg()' asserts that @c->space_lock is held */
+        spin_lock(&c->space_lock);
+        dbg_dump_budg(c);
+        spin_unlock(&c->space_lock);
+        dump_stack();
+        return -EINVAL;
+}
+/**
 * dbg_check_synced_i_size - check synchronized inode size.
 * @inode: inode to check
 *
@@ -992,8 +1072,8 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
                        zbr1->offs, DBGKEY(&key));
                dbg_err("but it should have key %s according to tnc",
                        DBGKEY(&zbr1->key));
-                        dbg_dump_node(c, dent1);
+                dbg_dump_node(c, dent1);
-                        goto out_free;
+                goto out_free;
        }
        key_read(c, &dent2->key, &key);
@@ -1002,8 +1082,8 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
                        zbr1->offs, DBGKEY(&key));
                dbg_err("but it should have key %s according to tnc",
                        DBGKEY(&zbr2->key));
-                        dbg_dump_node(c, dent2);
+                dbg_dump_node(c, dent2);
-                        goto out_free;
+                goto out_free;
        }
        nlen1 = le16_to_cpu(dent1->nlen);
@@ -1020,9 +1100,9 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
                dbg_err("bad order of colliding key %s",
                        DBGKEY(&key));
-        dbg_msg("first node at %d:%d\n", zbr1->lnum, zbr1->offs);
+        ubifs_msg("first node at %d:%d\n", zbr1->lnum, zbr1->offs);
        dbg_dump_node(c, dent1);
-        dbg_msg("second node at %d:%d\n", zbr2->lnum, zbr2->offs);
+        ubifs_msg("second node at %d:%d\n", zbr2->lnum, zbr2->offs);
        dbg_dump_node(c, dent2);
 out_free:
@@ -1327,7 +1407,7 @@ int dbg_check_tnc(struct ubifs_info *c, int extra)
 * @c: UBIFS file-system description object
 * @leaf_cb: called for each leaf node
 * @znode_cb: called for each indexing node
- * @priv: private date which is passed to callbacks
+ * @priv: private data which is passed to callbacks
 *
 * This function walks the UBIFS index and calls the @leaf_cb for each leaf
 * node and @znode_cb for each indexing node. Returns zero in case of success
@@ -2097,13 +2177,13 @@ static int simple_rand(void)
        return (next >> 16) & 32767;
 }
-void dbg_failure_mode_registration(struct ubifs_info *c)
+static void failure_mode_init(struct ubifs_info *c)
 {
        struct failure_mode_info *fmi;
        fmi = kmalloc(sizeof(struct failure_mode_info), GFP_NOFS);
        if (!fmi) {
-                dbg_err("Failed to register failure mode - no memory");
+                ubifs_err("Failed to register failure mode - no memory");
                return;
        }
        fmi->c = c;
@@ -2112,7 +2192,7 @@ void dbg_failure_mode_registration(struct ubifs_info *c)
        spin_unlock(&fmi_lock);
 }
-void dbg_failure_mode_deregistration(struct ubifs_info *c)
+static void failure_mode_exit(struct ubifs_info *c)
 {
        struct failure_mode_info *fmi, *tmp;
@@ -2146,42 +2226,44 @@ static int in_failure_mode(struct ubi_volume_desc *desc)
        struct ubifs_info *c = dbg_find_info(desc);
        if (c && dbg_failure_mode)
-                return c->failure_mode;
+                return c->dbg->failure_mode;
        return 0;
 }
 static int do_fail(struct ubi_volume_desc *desc, int lnum, int write)
 {
        struct ubifs_info *c = dbg_find_info(desc);
+        struct ubifs_debug_info *d;
        if (!c || !dbg_failure_mode)
                return 0;
-        if (c->failure_mode)
+        d = c->dbg;
+        if (d->failure_mode)
                return 1;
-        if (!c->fail_cnt) {
+        if (!d->fail_cnt) {
                /* First call - decide delay to failure */
                if (chance(1, 2)) {
                        unsigned int delay = 1 << (simple_rand() >> 11);
                        if (chance(1, 2)) {
-                                c->fail_delay = 1;
+                                d->fail_delay = 1;
-                                c->fail_timeout = jiffies +
+                                d->fail_timeout = jiffies +
                                                  msecs_to_jiffies(delay);
                                dbg_rcvry("failing after %ums", delay);
                        } else {
-                                c->fail_delay = 2;
+                                d->fail_delay = 2;
-                                c->fail_cnt_max = delay;
+                                d->fail_cnt_max = delay;
                                dbg_rcvry("failing after %u calls", delay);
                        }
                }
-                c->fail_cnt += 1;
+                d->fail_cnt += 1;
        }
        /* Determine if failure delay has expired */
-        if (c->fail_delay == 1) {
+        if (d->fail_delay == 1) {
-                if (time_before(jiffies, c->fail_timeout))
+                if (time_before(jiffies, d->fail_timeout))
                        return 0;
-        } else if (c->fail_delay == 2)
+        } else if (d->fail_delay == 2)
-                if (c->fail_cnt++ < c->fail_cnt_max)
+                if (d->fail_cnt++ < d->fail_cnt_max)
                        return 0;
        if (lnum == UBIFS_SB_LNUM) {
                if (write) {
@@ -2239,7 +2321,7 @@ static int do_fail(struct ubi_volume_desc *desc, int lnum, int write)
                dbg_rcvry("failing in bud LEB %d commit not running", lnum);
        }
        ubifs_err("*** SETTING FAILURE MODE ON (LEB %d) ***", lnum);
-        c->failure_mode = 1;
+        d->failure_mode = 1;
        dump_stack();
        return 1;
 }
@@ -2344,4 +2426,177 @@ int dbg_leb_map(struct ubi_volume_desc *desc, int lnum, int dtype)
        return 0;
 }
+/**
+ * ubifs_debugging_init - initialize UBIFS debugging.
+ * @c: UBIFS file-system description object
+ *
+ * This function initializes debugging-related data for the file system.
+ * Returns zero in case of success and a negative error code in case of
+ * failure.
+ */
+int ubifs_debugging_init(struct ubifs_info *c)
+{
+        c->dbg = kzalloc(sizeof(struct ubifs_debug_info), GFP_KERNEL);
+        if (!c->dbg)
+                return -ENOMEM;
+        c->dbg->buf = vmalloc(c->leb_size);
+        if (!c->dbg->buf)
+                goto out;
+        failure_mode_init(c);
+        return 0;
+out:
+        kfree(c->dbg);
+        return -ENOMEM;
+}
+/**
+ * ubifs_debugging_exit - free debugging data.
+ * @c: UBIFS file-system description object
+ */
+void ubifs_debugging_exit(struct ubifs_info *c)
+{
+        failure_mode_exit(c);
+        vfree(c->dbg->buf);
+        kfree(c->dbg);
+}
+/*
+ * Root directory for UBIFS stuff in debugfs. Contains sub-directories which
+ * contain the stuff specific to particular file-system mounts.
+ */
+static struct dentry *dfs_rootdir;
+/**
+ * dbg_debugfs_init - initialize debugfs file-system.
+ *
+ * UBIFS uses debugfs file-system to expose various debugging knobs to
+ * user-space. This function creates "ubifs" directory in the debugfs
+ * file-system. Returns zero in case of success and a negative error code in
+ * case of failure.
+ */
+int dbg_debugfs_init(void)
+{
+        dfs_rootdir = debugfs_create_dir("ubifs", NULL);
+        if (IS_ERR(dfs_rootdir)) {
+                int err = PTR_ERR(dfs_rootdir);
+                ubifs_err("cannot create \"ubifs\" debugfs directory, "
+                          "error %d\n", err);
+                return err;
+        }
+        return 0;
+}
+/**
+ * dbg_debugfs_exit - remove the "ubifs" directory from debugfs file-system.
+ */
+void dbg_debugfs_exit(void)
+{
+        debugfs_remove(dfs_rootdir);
+}
+static int open_debugfs_file(struct inode *inode, struct file *file)
+{
+        file->private_data = inode->i_private;
+        return 0;
+}
+static ssize_t write_debugfs_file(struct file *file, const char __user *buf,
+                                  size_t count, loff_t *ppos)
+{
+        struct ubifs_info *c = file->private_data;
+        struct ubifs_debug_info *d = c->dbg;
+        if (file->f_path.dentry == d->dfs_dump_lprops)
+                dbg_dump_lprops(c);
+        else if (file->f_path.dentry == d->dfs_dump_budg) {
+                spin_lock(&c->space_lock);
+                dbg_dump_budg(c);
+                spin_unlock(&c->space_lock);
+        } else if (file->f_path.dentry == d->dfs_dump_tnc) {
+                mutex_lock(&c->tnc_mutex);
+                dbg_dump_tnc(c);
+                mutex_unlock(&c->tnc_mutex);
+        } else
+                return -EINVAL;
+        *ppos += count;
+        return count;
+}
+static const struct file_operations dfs_fops = {
+        .open = open_debugfs_file,
+        .write = write_debugfs_file,
+        .owner = THIS_MODULE,
+};
+/**
+ * dbg_debugfs_init_fs - initialize debugfs for UBIFS instance.
+ * @c: UBIFS file-system description object
+ *
+ * This function creates all debugfs files for this instance of UBIFS. Returns
+ * zero in case of success and a negative error code in case of failure.
+ *
+ * Note, the only reason we have not merged this function with the
+ * 'ubifs_debugging_init()' function is because it is better to initialize
+ * debugfs interfaces at the very end of the mount process, and remove them at
+ * the very beginning of the un-mount process.
+ */
+int dbg_debugfs_init_fs(struct ubifs_info *c)
+{
+        int err;
+        const char *fname;
+        struct dentry *dent;
+        struct ubifs_debug_info *d = c->dbg;
+        sprintf(d->dfs_dir_name, "ubi%d_%d", c->vi.ubi_num, c->vi.vol_id);
+        d->dfs_dir = debugfs_create_dir(d->dfs_dir_name, dfs_rootdir);
+        if (IS_ERR(d->dfs_dir)) {
+                err = PTR_ERR(d->dfs_dir);
+                ubifs_err("cannot create \"%s\" debugfs directory, error %d\n",
+                          d->dfs_dir_name, err);
+                goto out;
+        }
+        fname = "dump_lprops";
+        dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops);
+        if (IS_ERR(dent))
+                goto out_remove;
+        d->dfs_dump_lprops = dent;
+        fname = "dump_budg";
+        dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops);
+        if (IS_ERR(dent))
+                goto out_remove;
+        d->dfs_dump_budg = dent;
+        fname = "dump_tnc";
+        dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops);
+        if (IS_ERR(dent))
+                goto out_remove;
+        d->dfs_dump_tnc = dent;
+        return 0;
+out_remove:
+        err = PTR_ERR(dent);
+        ubifs_err("cannot create \"%s\" debugfs directory, error %d\n",
+                  fname, err);
+        debugfs_remove_recursive(d->dfs_dir);
+out:
+        return err;
+}
+/**
+ * dbg_debugfs_exit_fs - remove all debugfs files.
+ * @c: UBIFS file-system description object
+ */
+void dbg_debugfs_exit_fs(struct ubifs_info *c)
+{
+        debugfs_remove_recursive(c->dbg->dfs_dir);
+}
 #endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 33d6b95071e4..c1cd73b2e06e 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -25,7 +25,61 @@
 #ifdef CONFIG_UBIFS_FS_DEBUG
-#define UBIFS_DBG(op) op
+/**
+ * struct ubifs_debug_info - per-FS debugging information.
+ * @buf: a buffer of LEB size, used for various purposes
+ * @old_zroot: old index root - used by 'dbg_check_old_index()'
+ * @old_zroot_level: old index root level - used by 'dbg_check_old_index()'
+ * @old_zroot_sqnum: old index root sqnum - used by 'dbg_check_old_index()'
+ * @failure_mode: failure mode for recovery testing
+ * @fail_delay: 0=>don't delay, 1=>delay a time, 2=>delay a number of calls
+ * @fail_timeout: time in jiffies when delay of failure mode expires
+ * @fail_cnt: current number of calls to failure mode I/O functions
+ * @fail_cnt_max: number of calls by which to delay failure mode
+ * @chk_lpt_sz: used by LPT tree size checker
+ * @chk_lpt_sz2: used by LPT tree size checker
+ * @chk_lpt_wastage: used by LPT tree size checker
+ * @chk_lpt_lebs: used by LPT tree size checker
+ * @new_nhead_offs: used by LPT tree size checker
+ * @new_ihead_lnum: used by debugging to check @c->ihead_lnum
+ * @new_ihead_offs: used by debugging to check @c->ihead_offs
+ *
+ * @saved_lst: saved lprops statistics (used by 'dbg_save_space_info()')
+ * @saved_free: saved free space (used by 'dbg_save_space_info()')
+ *
+ * @dfs_dir_name: name of debugfs directory containing this file-system's files
+ * @dfs_dir: dentry object of the file-system debugfs directory
+ * @dfs_dump_lprops: "dump lprops" debugfs knob
+ * @dfs_dump_budg: "dump budgeting information" debugfs knob
+ * @dfs_dump_tnc: "dump TNC" debugfs knob
+ */
+struct ubifs_debug_info {
+        void *buf;
+        struct ubifs_zbranch old_zroot;
+        int old_zroot_level;
+        unsigned long long old_zroot_sqnum;
+        int failure_mode;
+        int fail_delay;
+        unsigned long fail_timeout;
+        unsigned int fail_cnt;
+        unsigned int fail_cnt_max;
+        long long chk_lpt_sz;
+        long long chk_lpt_sz2;
+        long long chk_lpt_wastage;
+        int chk_lpt_lebs;
+        int new_nhead_offs;
+        int new_ihead_lnum;
+        int new_ihead_offs;
+        struct ubifs_lp_stats saved_lst;
+        long long saved_free;
+        char dfs_dir_name[100];
+        struct dentry *dfs_dir;
+        struct dentry *dfs_dump_lprops;
+        struct dentry *dfs_dump_budg;
+        struct dentry *dfs_dump_tnc;
+};
 #define ubifs_assert(expr) do {                                                \
        if (unlikely(!(expr))) {                                               \
@@ -211,14 +265,18 @@ extern unsigned int ubifs_msg_flags;
 extern unsigned int ubifs_chk_flags;
 extern unsigned int ubifs_tst_flags;
-/* Dump functions */
+int ubifs_debugging_init(struct ubifs_info *c);
+void ubifs_debugging_exit(struct ubifs_info *c);
+/* Dump functions */
 const char *dbg_ntype(int type);
 const char *dbg_cstate(int cmt_state);
 const char *dbg_get_key_dump(const struct ubifs_info *c,
                             const union ubifs_key *key);
 void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode);
 void dbg_dump_node(const struct ubifs_info *c, const void *node);
+void dbg_dump_lpt_node(const struct ubifs_info *c, void *node, int lnum,
+                       int offs);
 void dbg_dump_budget_req(const struct ubifs_budget_req *req);
 void dbg_dump_lstats(const struct ubifs_lp_stats *lst);
 void dbg_dump_budg(struct ubifs_info *c);
@@ -233,9 +291,9 @@ void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
                    struct ubifs_nnode *parent, int iip);
 void dbg_dump_tnc(struct ubifs_info *c);
 void dbg_dump_index(struct ubifs_info *c);
+void dbg_dump_lpt_lebs(const struct ubifs_info *c);
 /* Checking helper functions */
 typedef int (*dbg_leaf_callback)(struct ubifs_info *c,
                                 struct ubifs_zbranch *zbr, void *priv);
 typedef int (*dbg_znode_callback)(struct ubifs_info *c,
@@ -244,7 +302,8 @@ int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb,
                   dbg_znode_callback znode_cb, void *priv);
 /* Checking functions */
+void dbg_save_space_info(struct ubifs_info *c);
+int dbg_check_space_info(struct ubifs_info *c);
 int dbg_check_lprops(struct ubifs_info *c);
 int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot);
 int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot);
@@ -274,9 +333,6 @@ int dbg_force_in_the_gaps(void);
 #define dbg_failure_mode (ubifs_tst_flags & UBIFS_TST_RCVRY)
-void dbg_failure_mode_registration(struct ubifs_info *c);
-void dbg_failure_mode_deregistration(struct ubifs_info *c);
 #ifndef UBIFS_DBG_PRESERVE_UBI
 #define ubi_leb_read   dbg_leb_read
@@ -318,9 +374,13 @@ static inline int dbg_change(struct ubi_volume_desc *desc, int lnum,
        return dbg_leb_change(desc, lnum, buf, len, UBI_UNKNOWN);
 }
-#else /* !CONFIG_UBIFS_FS_DEBUG */
+/* Debugfs-related stuff */
+int dbg_debugfs_init(void);
+void dbg_debugfs_exit(void);
+int dbg_debugfs_init_fs(struct ubifs_info *c);
+void dbg_debugfs_exit_fs(struct ubifs_info *c);
-#define UBIFS_DBG(op)
+#else /* !CONFIG_UBIFS_FS_DEBUG */
 /* Use "if (0)" to make compiler check arguments even if debugging is off */
 #define ubifs_assert(expr)  do {                                               \
@@ -360,26 +420,33 @@ static inline int dbg_change(struct ubi_volume_desc *desc, int lnum,
 #define DBGKEY(key)  ((char *)(key))
 #define DBGKEY1(key) ((char *)(key))
-#define dbg_ntype(type)                       ""
+#define ubifs_debugging_init(c)                0
-#define dbg_cstate(cmt_state)                 ""
+#define ubifs_debugging_exit(c)                ({})
-#define dbg_get_key_dump(c, key)              ({})
-#define dbg_dump_inode(c, inode)              ({})
+#define dbg_ntype(type)                        ""
-#define dbg_dump_node(c, node)                ({})
+#define dbg_cstate(cmt_state)                  ""
-#define dbg_dump_budget_req(req)              ({})
+#define dbg_get_key_dump(c, key)               ({})
-#define dbg_dump_lstats(lst)                  ({})
+#define dbg_dump_inode(c, inode)               ({})
-#define dbg_dump_budg(c)                      ({})
+#define dbg_dump_node(c, node)                 ({})
-#define dbg_dump_lprop(c, lp)                 ({})
+#define dbg_dump_lpt_node(c, node, lnum, offs) ({})
-#define dbg_dump_lprops(c)                    ({})
+#define dbg_dump_budget_req(req)               ({})
-#define dbg_dump_lpt_info(c)                  ({})
+#define dbg_dump_lstats(lst)                   ({})
-#define dbg_dump_leb(c, lnum)                 ({})
+#define dbg_dump_budg(c)                       ({})
-#define dbg_dump_znode(c, znode)              ({})
+#define dbg_dump_lprop(c, lp)                  ({})
-#define dbg_dump_heap(c, heap, cat)           ({})
+#define dbg_dump_lprops(c)                     ({})
-#define dbg_dump_pnode(c, pnode, parent, iip) ({})
+#define dbg_dump_lpt_info(c)                   ({})
-#define dbg_dump_tnc(c)                       ({})
+#define dbg_dump_leb(c, lnum)                  ({})
-#define dbg_dump_index(c)                     ({})
+#define dbg_dump_znode(c, znode)               ({})
+#define dbg_dump_heap(c, heap, cat)            ({})
+#define dbg_dump_pnode(c, pnode, parent, iip)  ({})
+#define dbg_dump_tnc(c)                        ({})
+#define dbg_dump_index(c)                      ({})
+#define dbg_dump_lpt_lebs(c)                   ({})
 #define dbg_walk_index(c, leaf_cb, znode_cb, priv) 0
 #define dbg_old_index_check_init(c, zroot)         0
+#define dbg_save_space_info(c)                     ({})
+#define dbg_check_space_info(c)                    0
 #define dbg_check_old_index(c, zroot)              0
 #define dbg_check_cats(c)                          0
 #define dbg_check_ltab(c)                          0
@@ -396,9 +463,11 @@ static inline int dbg_change(struct ubi_volume_desc *desc, int lnum,
 #define dbg_force_in_the_gaps_enabled              0
 #define dbg_force_in_the_gaps()                    0
 #define dbg_failure_mode                           0
-#define dbg_failure_mode_registration(c)           ({})
-#define dbg_failure_mode_deregistration(c)         ({})
-#endif /* !CONFIG_UBIFS_FS_DEBUG */
+#define dbg_debugfs_init()                         0
+#define dbg_debugfs_exit()
+#define dbg_debugfs_init_fs(c)                     0
+#define dbg_debugfs_exit_fs(c)                     0
+#endif /* !CONFIG_UBIFS_FS_DEBUG */
 #endif /* !__UBIFS_DEBUG_H__ */
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index f448ab1f9c38..f55d523c52bb 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -482,30 +482,29 @@ static int ubifs_dir_release(struct inode *dir, struct file *file)
 }
 /**
- * lock_2_inodes - lock two UBIFS inodes.
+ * lock_2_inodes - a wrapper for locking two UBIFS inodes.
 * @inode1: first inode
 * @inode2: second inode
+ *
+ * We do not implement any tricks to guarantee strict lock ordering, because
+ * VFS has already done it for us on the @i_mutex. So this is just a simple
+ * wrapper function.
 */
 static void lock_2_inodes(struct inode *inode1, struct inode *inode2)
 {
-        if (inode1->i_ino < inode2->i_ino) {
+        mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1);
-                mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_2);
+        mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2);
-                mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_3);
-        } else {
-                mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2);
-                mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_3);
-        }
 }
 /**
- * unlock_2_inodes - unlock two UBIFS inodes inodes.
+ * unlock_2_inodes - a wrapper for unlocking two UBIFS inodes.
 * @inode1: first inode
 * @inode2: second inode
 */
 static void unlock_2_inodes(struct inode *inode1, struct inode *inode2)
 {
-        mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
        mutex_unlock(&ubifs_inode(inode2)->ui_mutex);
+        mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
 }
 static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
@@ -527,6 +526,8 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
        dbg_gen("dent '%.*s' to ino %lu (nlink %d) in dir ino %lu",
                dentry->d_name.len, dentry->d_name.name, inode->i_ino,
                inode->i_nlink, dir->i_ino);
+        ubifs_assert(mutex_is_locked(&dir->i_mutex));
+        ubifs_assert(mutex_is_locked(&inode->i_mutex));
        err = dbg_check_synced_i_size(inode);
        if (err)
                return err;
@@ -580,6 +581,8 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
        dbg_gen("dent '%.*s' from ino %lu (nlink %d) in dir ino %lu",
                dentry->d_name.len, dentry->d_name.name, inode->i_ino,
                inode->i_nlink, dir->i_ino);
+        ubifs_assert(mutex_is_locked(&dir->i_mutex));
+        ubifs_assert(mutex_is_locked(&inode->i_mutex));
        err = dbg_check_synced_i_size(inode);
        if (err)
                return err;
@@ -667,7 +670,8 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
        dbg_gen("directory '%.*s', ino %lu in dir ino %lu", dentry->d_name.len,
                dentry->d_name.name, inode->i_ino, dir->i_ino);
+        ubifs_assert(mutex_is_locked(&dir->i_mutex));
+        ubifs_assert(mutex_is_locked(&inode->i_mutex));
        err = check_dir_empty(c, dentry->d_inode);
        if (err)
                return err;
@@ -922,59 +926,30 @@ out_budg:
 }
 /**
- * lock_3_inodes - lock three UBIFS inodes for rename.
+ * lock_3_inodes - a wrapper for locking three UBIFS inodes.
 * @inode1: first inode
 * @inode2: second inode
 * @inode3: third inode
 *
- * For 'ubifs_rename()', @inode1 may be the same as @inode2 whereas @inode3 may
+ * This function is used for 'ubifs_rename()' and @inode1 may be the same as
- * be null.
+ * @inode2 whereas @inode3 may be %NULL.
+ *
+ * We do not implement any tricks to guarantee strict lock ordering, because
+ * VFS has already done it for us on the @i_mutex. So this is just a simple
+ * wrapper function.
 */
 static void lock_3_inodes(struct inode *inode1, struct inode *inode2,
                          struct inode *inode3)
 {
-        struct inode *i1, *i2, *i3;
+        mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1);
+        if (inode2 != inode1)
-        if (!inode3) {
+                mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2);
-                if (inode1 != inode2) {
+        if (inode3)
-                        lock_2_inodes(inode1, inode2);
+                mutex_lock_nested(&ubifs_inode(inode3)->ui_mutex, WB_MUTEX_3);
-                        return;
-                }
-                mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1);
-                return;
-        }
-        if (inode1 == inode2) {
-                lock_2_inodes(inode1, inode3);
-                return;
-        }
-        /* 3 different inodes */
-        if (inode1 < inode2) {
-                i3 = inode2;
-                if (inode1 < inode3) {
-                        i1 = inode1;
-                        i2 = inode3;
-                } else {
-                        i1 = inode3;
-                        i2 = inode1;
-                }
-        } else {
-                i3 = inode1;
-                if (inode2 < inode3) {
-                        i1 = inode2;
-                        i2 = inode3;
-                } else {
-                        i1 = inode3;
-                        i2 = inode2;
-                }
-        }
-        mutex_lock_nested(&ubifs_inode(i1)->ui_mutex, WB_MUTEX_1);
-        lock_2_inodes(i2, i3);
 }
 /**
- * unlock_3_inodes - unlock three UBIFS inodes for rename.
+ * unlock_3_inodes - a wrapper for unlocking three UBIFS inodes for rename.
 * @inode1: first inode
 * @inode2: second inode
 * @inode3: third inode
@@ -982,11 +957,11 @@ static void lock_3_inodes(struct inode *inode1, struct inode *inode2,
 static void unlock_3_inodes(struct inode *inode1, struct inode *inode2,
                            struct inode *inode3)
 {
-        mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
-        if (inode1 != inode2)
-                mutex_unlock(&ubifs_inode(inode2)->ui_mutex);
        if (inode3)
                mutex_unlock(&ubifs_inode(inode3)->ui_mutex);
+        if (inode1 != inode2)
+                mutex_unlock(&ubifs_inode(inode2)->ui_mutex);
+        mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
 }
 static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
@@ -1020,6 +995,11 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
                "dir ino %lu", old_dentry->d_name.len, old_dentry->d_name.name,
                old_inode->i_ino, old_dir->i_ino, new_dentry->d_name.len,
                new_dentry->d_name.name, new_dir->i_ino);
+        ubifs_assert(mutex_is_locked(&old_dir->i_mutex));
+        ubifs_assert(mutex_is_locked(&new_dir->i_mutex));
+        if (unlink)
+                ubifs_assert(mutex_is_locked(&new_inode->i_mutex));
        if (unlink && is_dir) {
                err = check_dir_empty(c, new_inode);
@@ -1199,7 +1179,7 @@ int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
        return 0;
 }
-struct inode_operations ubifs_dir_inode_operations = {
+const struct inode_operations ubifs_dir_inode_operations = {
        .lookup      = ubifs_lookup,
        .create      = ubifs_create,
        .link        = ubifs_link,
@@ -1219,7 +1199,7 @@ struct inode_operations ubifs_dir_inode_operations = {
 #endif
 };
-struct file_operations ubifs_dir_operations = {
+const struct file_operations ubifs_dir_operations = {
        .llseek         = ubifs_dir_llseek,
        .release        = ubifs_dir_release,
        .read           = generic_read_dir,
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 2624411d9758..93b6de51f261 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -72,8 +72,8 @@ static int read_block(struct inode *inode, void *addr, unsigned int block,
                return err;
        }
-        ubifs_assert(le64_to_cpu(dn->ch.sqnum) > ubifs_inode(inode)->creat_sqnum);
+        ubifs_assert(le64_to_cpu(dn->ch.sqnum) >
+                     ubifs_inode(inode)->creat_sqnum);
        len = le32_to_cpu(dn->size);
        if (len <= 0 || len > UBIFS_BLOCK_SIZE)
                goto dump;
@@ -219,7 +219,8 @@ static void release_existing_page_budget(struct ubifs_info *c)
 }
 static int write_begin_slow(struct address_space *mapping,
-                            loff_t pos, unsigned len, struct page **pagep)
+                            loff_t pos, unsigned len, struct page **pagep,
+                            unsigned flags)
 {
        struct inode *inode = mapping->host;
        struct ubifs_info *c = inode->i_sb->s_fs_info;
@@ -247,14 +248,14 @@ static int write_begin_slow(struct address_space *mapping,
        if (unlikely(err))
                return err;
-        page = __grab_cache_page(mapping, index);
+        page = grab_cache_page_write_begin(mapping, index, flags);
        if (unlikely(!page)) {
                ubifs_release_budget(c, &req);
                return -ENOMEM;
        }
        if (!PageUptodate(page)) {
-                if (!(pos & PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE)
+                if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE)
                        SetPageChecked(page);
                else {
                        err = do_readpage(page);
@@ -431,20 +432,19 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
        int uninitialized_var(err), appending = !!(pos + len > inode->i_size);
        struct page *page;
        ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size);
        if (unlikely(c->ro_media))
                return -EROFS;
        /* Try out the fast-path part first */
-        page = __grab_cache_page(mapping, index);
+        page = grab_cache_page_write_begin(mapping, index, flags);
        if (unlikely(!page))
                return -ENOMEM;
        if (!PageUptodate(page)) {
                /* The page is not loaded from the flash */
-                if (!(pos & PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE)
+                if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE)
                        /*
                         * We change whole page so no need to load it. But we
                         * have to set the @PG_checked flag to make the further
@@ -483,7 +483,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
                unlock_page(page);
                page_cache_release(page);
-                return write_begin_slow(mapping, pos, len, pagep);
+                return write_begin_slow(mapping, pos, len, pagep, flags);
        }
        /*
@@ -1540,7 +1540,7 @@ static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma)
        return 0;
 }
-struct address_space_operations ubifs_file_address_operations = {
+const struct address_space_operations ubifs_file_address_operations = {
        .readpage       = ubifs_readpage,
        .writepage      = ubifs_writepage,
        .write_begin    = ubifs_write_begin,
@@ -1550,7 +1550,7 @@ struct address_space_operations ubifs_file_address_operations = {
        .releasepage    = ubifs_releasepage,
 };
-struct inode_operations ubifs_file_inode_operations = {
+const struct inode_operations ubifs_file_inode_operations = {
        .setattr     = ubifs_setattr,
        .getattr     = ubifs_getattr,
 #ifdef CONFIG_UBIFS_FS_XATTR
@@ -1561,14 +1561,14 @@ struct inode_operations ubifs_file_inode_operations = {
 #endif
 };
-struct inode_operations ubifs_symlink_inode_operations = {
+const struct inode_operations ubifs_symlink_inode_operations = {
        .readlink    = generic_readlink,
        .follow_link = ubifs_follow_link,
        .setattr     = ubifs_setattr,
        .getattr     = ubifs_getattr,
 };
-struct file_operations ubifs_file_operations = {
+const struct file_operations ubifs_file_operations = {
        .llseek         = generic_file_llseek,
        .read           = do_sync_read,
        .write          = do_sync_write,
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index 0bef6501d58a..a711d33b3d3e 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -31,6 +31,26 @@
 * to be reused. Garbage collection will cause the number of dirty index nodes
 * to grow, however sufficient space is reserved for the index to ensure the
 * commit will never run out of space.
+ *
+ * Notes about dead watermark. In the current implementation we assume that
+ * LEBs which have less than @c->dead_wm bytes of free + dirty space are full
+ * and not worth garbage-collecting. The dead watermark is one min. I/O unit
+ * size, or min. UBIFS node size, depending on what is greater. Indeed, UBIFS
+ * Garbage Collector has to synchronize the GC head's write buffer before
+ * returning, so this is about wasting one min. I/O unit. However, UBIFS GC can
+ * actually reclaim even very small pieces of dirty space by garbage collecting
+ * enough dirty LEBs, but we do not bother doing this in this implementation.
+ *
+ * Notes about dark watermark. The results of GC work depend on the size of
+ * the UBIFS nodes GC deals with. Large nodes make GC waste more space. Indeed,
+ * if GC moves data from LEB A to LEB B and nodes in LEB A are large, GC would
+ * have to waste large pieces of free space at the end of LEB B, because nodes
+ * from LEB A would not fit. And the worst situation is when all nodes are of
+ * maximum size. So dark watermark is the amount of free + dirty space in LEB
+ * which is guaranteed to be reclaimable. If LEB has less space, the GC might
+ * be unable to reclaim it. So, LEBs with free + dirty greater than dark
+ * watermark are "good" LEBs from GC's point of view. The other LEBs are not so
+ * good, and GC takes extra care when moving them.
 */
 #include <linux/pagemap.h>
@@ -45,7 +65,7 @@
 #define SMALL_NODE_WM  UBIFS_MAX_DENT_NODE_SZ
 /*
- * GC may need to move more then one LEB to make progress. The below constants
+ * GC may need to move more than one LEB to make progress. The below constants
 * define "soft" and "hard" limits on the number of LEBs the garbage collector
 * may move.
 */
@@ -381,7 +401,7 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp)
                /*
                 * Don't release the LEB until after the next commit, because
-                 * it may contain date which is needed for recovery. So
+                 * it may contain data which is needed for recovery. So
                 * although we freed this LEB, it will become usable only after
                 * the commit.
                 */
@@ -810,8 +830,9 @@ out:
 * ubifs_destroy_idx_gc - destroy idx_gc list.
 * @c: UBIFS file-system description object
 *
- * This function destroys the idx_gc list. It is called when unmounting or
+ * This function destroys the @c->idx_gc list. It is called when unmounting
- * remounting read-only so locks are not needed.
+ * so locks are not needed. The function is declared 'void' and returns no
+ * value.
 */
 void ubifs_destroy_idx_gc(struct ubifs_info *c)
 {
@@ -824,7 +845,6 @@ void ubifs_destroy_idx_gc(struct ubifs_info *c)
                list_del(&idx_gc->list);
                kfree(idx_gc);
        }
 }
 /**
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 01682713af69..e8e632a1dcdf 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -29,7 +29,7 @@
 * would have been wasted for padding to the nearest minimal I/O unit boundary.
 * Instead, data first goes to the write-buffer and is flushed when the
 * buffer is full or when it is not used for some time (by timer). This is
- * similarto the mechanism is used by JFFS2.
+ * similar to the mechanism used by JFFS2.
 *
 * Write-buffers are defined by 'struct ubifs_wbuf' objects and protected by
 * mutexes defined inside these objects. Since sometimes upper-level code
@@ -75,7 +75,7 @@ void ubifs_ro_mode(struct ubifs_info *c, int err)
 * @lnum: logical eraseblock number
 * @offs: offset within the logical eraseblock
 * @quiet: print no messages
- * @chk_crc: indicates whether to always check the CRC
+ * @must_chk_crc: indicates whether to always check the CRC
 *
 * This function checks node magic number and CRC checksum. This function also
 * validates node length to prevent UBIFS from becoming crazy when an attacker
@@ -83,11 +83,17 @@ void ubifs_ro_mode(struct ubifs_info *c, int err)
 * node length in the common header could cause UBIFS to read memory outside of
 * allocated buffer when checking the CRC checksum.
 *
- * This function returns zero in case of success %-EUCLEAN in case of bad CRC
+ * This function may skip data nodes CRC checking if @c->no_chk_data_crc is
- * or magic.
+ * true, which is controlled by corresponding UBIFS mount option. However, if
+ * @must_chk_crc is true, then @c->no_chk_data_crc is ignored and CRC is
+ * checked. Similarly, if @c->always_chk_crc is true, @c->no_chk_data_crc is
+ * ignored and CRC is checked.
+ *
+ * This function returns zero in case of success and %-EUCLEAN in case of bad
+ * CRC or magic.
 */
 int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
-                     int offs, int quiet, int chk_crc)
+                     int offs, int quiet, int must_chk_crc)
 {
        int err = -EINVAL, type, node_len;
        uint32_t crc, node_crc, magic;
@@ -123,9 +129,9 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
                   node_len > c->ranges[type].max_len)
                goto out_len;
-        if (!chk_crc && type == UBIFS_DATA_NODE && !c->always_chk_crc)
+        if (!must_chk_crc && type == UBIFS_DATA_NODE && !c->always_chk_crc &&
-                if (c->no_chk_data_crc)
+             c->no_chk_data_crc)
-                        return 0;
+                return 0;
        crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8);
        node_crc = le32_to_cpu(ch->crc);
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index 5e82cffe9695..6db7a6be6c97 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -154,6 +154,7 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
        case FS_IOC_GETFLAGS:
                flags = ubifs2ioctl(ubifs_inode(inode)->flags);
+                dbg_gen("get flags: %#x, i_flags %#x", flags, inode->i_flags);
                return put_user(flags, (int __user *) arg);
        case FS_IOC_SETFLAGS: {
@@ -176,6 +177,7 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
                err = mnt_want_write(file->f_path.mnt);
                if (err)
                        return err;
+                dbg_gen("set flags: %#x, i_flags %#x", flags, inode->i_flags);
                err = setflags(inode, flags);
                mnt_drop_write(file->f_path.mnt);
                return err;
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index f91b745908ea..a11ca0958a23 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -191,7 +191,7 @@ again:
        if (wbuf->lnum != -1 && avail >= len) {
                /*
                 * Someone else has switched the journal head and we have
-                 * enough space now. This happens when more then one process is
+                 * enough space now. This happens when more than one process is
                 * trying to write to the same journal head at the same time.
                 */
                dbg_jnl("return LEB %d back, already have LEB %d:%d",
@@ -208,7 +208,7 @@ again:
        offs = 0;
 out:
-        err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs, UBI_SHORTTERM);
+        err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs, wbuf->dtype);
        if (err)
                goto out_unlock;
@@ -704,7 +704,7 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
        data->size = cpu_to_le32(len);
        zero_data_node_unused(data);
-        if (!(ui->flags && UBIFS_COMPR_FL))
+        if (!(ui->flags & UBIFS_COMPR_FL))
                /* Compression is disabled for this inode */
                compr_type = UBIFS_COMPR_NONE;
        else
@@ -1220,7 +1220,7 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
        data_key_init(c, &key, inum, blk);
        bit = old_size & (UBIFS_BLOCK_SIZE - 1);
-        blk = (old_size >> UBIFS_BLOCK_SHIFT) - (bit ? 0: 1);
+        blk = (old_size >> UBIFS_BLOCK_SHIFT) - (bit ? 0 : 1);
        data_key_init(c, &to_key, inum, blk);
        err = ubifs_tnc_remove_range(c, &key, &to_key);
diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h
index 3f1f16bc25c9..efb3430a2581 100644
--- a/fs/ubifs/key.h
+++ b/fs/ubifs/key.h
@@ -38,6 +38,22 @@
 #define __UBIFS_KEY_H__
 /**
+ * key_mask_hash - mask a valid hash value.
+ * @hash: value to be masked
+ *
+ * We use hash values as offset in directories, so values %0 and %1 are
+ * reserved for "." and "..". %2 is reserved for "end of readdir" marker. This
+ * function makes sure the reserved values are not used.
+ *
+ * Returns @hash masked with %UBIFS_S_KEY_HASH_MASK; if the masked value is
+ * %0, %1 or %2, then %3 is added to it before it is returned.
+ */
+static inline uint32_t key_mask_hash(uint32_t hash)
+{
+        hash &= UBIFS_S_KEY_HASH_MASK;
+        if (unlikely(hash <= 2))
+                hash += 3;
+        return hash;
+}
+/**
 * key_r5_hash - R5 hash function (borrowed from reiserfs).
 * @s: direntry name
 * @len: name length
@@ -54,16 +70,7 @@ static inline uint32_t key_r5_hash(const char *s, int len)
                str++;
        }
-        a &= UBIFS_S_KEY_HASH_MASK;
+        return key_mask_hash(a);
-        /*
-         * We use hash values as offset in directories, so values %0 and %1 are
-         * reserved for "." and "..". %2 is reserved for "end of readdir"
-         * marker.
-         */
-        if (unlikely(a >= 0 && a <= 2))
-                a += 3;
-        return a;
 }
 /**
@@ -77,10 +84,7 @@ static inline uint32_t key_test_hash(const char *str, int len)
        len = min_t(uint32_t, len, 4);
        memcpy(&a, str, len);
-        a &= UBIFS_S_KEY_HASH_MASK;
+        return key_mask_hash(a);
-        if (unlikely(a >= 0 && a <= 2))
-                a += 3;
-        return a;
 }
 /**
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index f27176e9b70d..4cdd284dea56 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -520,13 +520,13 @@ static int is_lprops_dirty(struct ubifs_info *c, struct ubifs_lprops *lprops)
 * @flags: new flags
 * @idx_gc_cnt: change to the count of idx_gc list
 *
- * This function changes LEB properties. This function does not change a LEB
+ * This function changes LEB properties (@free, @dirty or @flags). However, the
- * property (@free, @dirty or @flag) if the value passed is %LPROPS_NC.
+ * property which has the %LPROPS_NC value is not changed. Returns a pointer to
+ * the updated LEB properties on success and a negative error code on failure.
 *
- * This function returns a pointer to the updated LEB properties on success
+ * Note, the LEB properties may have had to be copied (due to COW) and
- * and a negative error code on failure. N.B. the LEB properties may have had to
+ * consequently the pointer returned may not be the same as the pointer
- * be copied (due to COW) and consequently the pointer returned may not be the
+ * passed.
- * same as the pointer passed.
 */
 const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
                                           const struct ubifs_lprops *lp,
@@ -635,10 +635,10 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
 * @c: UBIFS file-system description object
 * @lst: return statistics
 */
-void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *st)
+void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *lst)
 {
        spin_lock(&c->space_lock);
-        memcpy(st, &c->lst, sizeof(struct ubifs_lp_stats));
+        memcpy(lst, &c->lst, sizeof(struct ubifs_lp_stats));
        spin_unlock(&c->space_lock);
 }
@@ -678,6 +678,9 @@ int ubifs_change_one_lp(struct ubifs_info *c, int lnum, int free, int dirty,
 out:
        ubifs_release_lprops(c);
+        if (err)
+                ubifs_err("cannot change properties of LEB %d, error %d",
+                          lnum, err);
        return err;
 }
@@ -714,6 +717,9 @@ int ubifs_update_one_lp(struct ubifs_info *c, int lnum, int free, int dirty,
 out:
        ubifs_release_lprops(c);
+        if (err)
+                ubifs_err("cannot update properties of LEB %d, error %d",
+                          lnum, err);
        return err;
 }
@@ -737,6 +743,8 @@ int ubifs_read_one_lp(struct ubifs_info *c, int lnum, struct ubifs_lprops *lp)
        lpp = ubifs_lpt_lookup(c, lnum);
        if (IS_ERR(lpp)) {
                err = PTR_ERR(lpp);
+                ubifs_err("cannot read properties of LEB %d, error %d",
+                          lnum, err);
                goto out;
        }
@@ -1088,7 +1096,7 @@ static int scan_check_cb(struct ubifs_info *c,
                }
        }
-        sleb = ubifs_scan(c, lnum, 0, c->dbg_buf);
+        sleb = ubifs_scan(c, lnum, 0, c->dbg->buf);
        if (IS_ERR(sleb)) {
                /*
                 * After an unclean unmount, empty and freeable LEBs
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index db8bd0e518b2..b2792e84d245 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -36,15 +36,16 @@
 * can be written into a single eraseblock. In that case, garbage collection
 * consists of just writing the whole table, which therefore makes all other
 * eraseblocks reusable. In the case of the big model, dirty eraseblocks are
- * selected for garbage collection, which consists are marking the nodes in
+ * selected for garbage collection, which consists of marking the clean nodes in
 * that LEB as dirty, and then only the dirty nodes are written out. Also, in
 * the case of the big model, a table of LEB numbers is saved so that the entire
 * LPT does not need to be scanned looking for empty eraseblocks when UBIFS is first
 * mounted.
 */
-#include <linux/crc16.h>
 #include "ubifs.h"
+#include <linux/crc16.h>
+#include <linux/math64.h>
 /**
 * do_calc_lpt_geom - calculate sizes for the LPT area.
@@ -135,15 +136,13 @@ static void do_calc_lpt_geom(struct ubifs_info *c)
 int ubifs_calc_lpt_geom(struct ubifs_info *c)
 {
        int lebs_needed;
-        uint64_t sz;
+        long long sz;
        do_calc_lpt_geom(c);
        /* Verify that lpt_lebs is big enough */
        sz = c->lpt_sz * 2; /* Must have at least 2 times the size */
-        sz += c->leb_size - 1;
+        lebs_needed = div_u64(sz + c->leb_size - 1, c->leb_size);
-        do_div(sz, c->leb_size);
-        lebs_needed = sz;
        if (lebs_needed > c->lpt_lebs) {
                ubifs_err("too few LPT LEBs");
                return -EINVAL;
@@ -156,7 +155,6 @@ int ubifs_calc_lpt_geom(struct ubifs_info *c)
        }
        c->check_lpt_free = c->big_lpt;
        return 0;
 }
@@ -176,7 +174,7 @@ static int calc_dflt_lpt_geom(struct ubifs_info *c, int *main_lebs,
                              int *big_lpt)
 {
        int i, lebs_needed;
-        uint64_t sz;
+        long long sz;
        /* Start by assuming the minimum number of LPT LEBs */
        c->lpt_lebs = UBIFS_MIN_LPT_LEBS;
@@ -203,9 +201,7 @@ static int calc_dflt_lpt_geom(struct ubifs_info *c, int *main_lebs,
        /* Now check there are enough LPT LEBs */
        for (i = 0; i < 64 ; i++) {
                sz = c->lpt_sz * 4; /* Allow 4 times the size */
-                sz += c->leb_size - 1;
+                lebs_needed = div_u64(sz + c->leb_size - 1, c->leb_size);
-                do_div(sz, c->leb_size);
-                lebs_needed = sz;
                if (lebs_needed > c->lpt_lebs) {
                        /* Not enough LPT LEBs so try again with more */
                        c->lpt_lebs = lebs_needed;
@@ -558,7 +554,7 @@ static int calc_nnode_num(int row, int col)
 * This function calculates and returns the nnode number based on the parent's
 * nnode number and the index in parent.
 */
-static int calc_nnode_num_from_parent(struct ubifs_info *c,
+static int calc_nnode_num_from_parent(const struct ubifs_info *c,
                                      struct ubifs_nnode *parent, int iip)
 {
        int num, shft;
@@ -583,7 +579,7 @@ static int calc_nnode_num_from_parent(struct ubifs_info *c,
 * This function calculates and returns the pnode number based on the parent's
 * nnode number and the index in parent.
 */
-static int calc_pnode_num_from_parent(struct ubifs_info *c,
+static int calc_pnode_num_from_parent(const struct ubifs_info *c,
                                      struct ubifs_nnode *parent, int iip)
 {
        int i, n = c->lpt_hght - 1, pnum = parent->num, num = 0;
@@ -966,7 +962,7 @@ static int check_lpt_type(uint8_t **addr, int *pos, int type)
 *
 * This function returns %0 on success and a negative error code on failure.
 */
-static int unpack_pnode(struct ubifs_info *c, void *buf,
+static int unpack_pnode(const struct ubifs_info *c, void *buf,
                        struct ubifs_pnode *pnode)
 {
        uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
@@ -996,15 +992,15 @@ static int unpack_pnode(struct ubifs_info *c, void *buf,
 }
 /**
- * unpack_nnode - unpack a nnode.
+ * ubifs_unpack_nnode - unpack a nnode.
 * @c: UBIFS file-system description object
 * @buf: buffer containing packed nnode to unpack
 * @nnode: nnode structure to fill
 *
 * This function returns %0 on success and a negative error code on failure.
 */
-static int unpack_nnode(struct ubifs_info *c, void *buf,
+int ubifs_unpack_nnode(const struct ubifs_info *c, void *buf,
-                        struct ubifs_nnode *nnode)
+                       struct ubifs_nnode *nnode)
 {
        uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
        int i, pos = 0, err;
@@ -1036,7 +1032,7 @@ static int unpack_nnode(struct ubifs_info *c, void *buf,
 *
 * This function returns %0 on success and a negative error code on failure.
 */
-static int unpack_ltab(struct ubifs_info *c, void *buf)
+static int unpack_ltab(const struct ubifs_info *c, void *buf)
 {
        uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
        int i, pos = 0, err;
@@ -1068,7 +1064,7 @@ static int unpack_ltab(struct ubifs_info *c, void *buf)
 *
 * This function returns %0 on success and a negative error code on failure.
 */
-static int unpack_lsave(struct ubifs_info *c, void *buf)
+static int unpack_lsave(const struct ubifs_info *c, void *buf)
 {
        uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
        int i, pos = 0, err;
@@ -1096,7 +1092,7 @@ static int unpack_lsave(struct ubifs_info *c, void *buf)
 *
 * This function returns %0 on success and a negative error code on failure.
 */
-static int validate_nnode(struct ubifs_info *c, struct ubifs_nnode *nnode,
+static int validate_nnode(const struct ubifs_info *c, struct ubifs_nnode *nnode,
                          struct ubifs_nnode *parent, int iip)
 {
        int i, lvl, max_offs;
@@ -1140,7 +1136,7 @@ static int validate_nnode(struct ubifs_info *c, struct ubifs_nnode *nnode,
 *
 * This function returns %0 on success and a negative error code on failure.
 */
-static int validate_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
+static int validate_pnode(const struct ubifs_info *c, struct ubifs_pnode *pnode,
                          struct ubifs_nnode *parent, int iip)
 {
        int i;
@@ -1174,7 +1170,8 @@ static int validate_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
 * This function calculates the LEB numbers for the LEB properties it contains
 * based on the pnode number.
 */
-static void set_pnode_lnum(struct ubifs_info *c, struct ubifs_pnode *pnode)
+static void set_pnode_lnum(const struct ubifs_info *c,
+                           struct ubifs_pnode *pnode)
 {
        int i, lnum;
@@ -1227,7 +1224,7 @@ int ubifs_read_nnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip)
                err = ubi_read(c->ubi, lnum, buf, offs, c->nnode_sz);
                if (err)
                        goto out;
-                err = unpack_nnode(c, buf, nnode);
+                err = ubifs_unpack_nnode(c, buf, nnode);
                if (err)
                        goto out;
        }
@@ -1816,7 +1813,7 @@ static struct ubifs_nnode *scan_get_nnode(struct ubifs_info *c,
                               c->nnode_sz);
                if (err)
                        return ERR_PTR(err);
-                err = unpack_nnode(c, buf, nnode);
+                err = ubifs_unpack_nnode(c, buf, nnode);
                if (err)
                        return ERR_PTR(err);
        }
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index a41434b42785..3216a1f277f8 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -320,6 +320,8 @@ no_space:
        dbg_err("LPT out of space at LEB %d:%d needing %d, done_ltab %d, "
                "done_lsave %d", lnum, offs, len, done_ltab, done_lsave);
        dbg_dump_lpt_info(c);
+        dbg_dump_lpt_lebs(c);
+        dump_stack();
        return err;
 }
@@ -546,29 +548,31 @@ static int write_cnodes(struct ubifs_info *c)
 no_space:
        ubifs_err("LPT out of space mismatch");
        dbg_err("LPT out of space mismatch at LEB %d:%d needing %d, done_ltab "
-                "%d, done_lsave %d", lnum, offs, len, done_ltab, done_lsave);
+                "%d, done_lsave %d", lnum, offs, len, done_ltab, done_lsave);
        dbg_dump_lpt_info(c);
+        dbg_dump_lpt_lebs(c);
+        dump_stack();
        return err;
 }
 /**
- * next_pnode - find next pnode.
+ * next_pnode_to_dirty - find next pnode to dirty.
 * @c: UBIFS file-system description object
 * @pnode: pnode
 *
- * This function returns the next pnode or %NULL if there are no more pnodes.
+ * This function returns the next pnode to dirty or %NULL if there are no more
+ * pnodes.  Note that pnodes that have never been written (lnum == 0) are
+ * skipped.
 */
-static struct ubifs_pnode *next_pnode(struct ubifs_info *c,
+static struct ubifs_pnode *next_pnode_to_dirty(struct ubifs_info *c,
-                                      struct ubifs_pnode *pnode)
+                                               struct ubifs_pnode *pnode)
 {
        struct ubifs_nnode *nnode;
        int iip;
        /* Try to go right */
        nnode = pnode->parent;
-        iip = pnode->iip + 1;
+        for (iip = pnode->iip + 1; iip < UBIFS_LPT_FANOUT; iip++) {
-        if (iip < UBIFS_LPT_FANOUT) {
-                /* We assume here that LEB zero is never an LPT LEB */
                if (nnode->nbranch[iip].lnum)
                        return ubifs_get_pnode(c, nnode, iip);
        }
@@ -579,8 +583,11 @@ static struct ubifs_pnode *next_pnode(struct ubifs_info *c,
                nnode = nnode->parent;
                if (!nnode)
                        return NULL;
-                /* We assume here that LEB zero is never an LPT LEB */
+                for (; iip < UBIFS_LPT_FANOUT; iip++) {
-        } while (iip >= UBIFS_LPT_FANOUT || !nnode->nbranch[iip].lnum);
+                        if (nnode->nbranch[iip].lnum)
+                                break;
+                }
+       } while (iip >= UBIFS_LPT_FANOUT);
        /* Go right */
        nnode = ubifs_get_nnode(c, nnode, iip);
@@ -589,12 +596,29 @@ static struct ubifs_pnode *next_pnode(struct ubifs_info *c,
        /* Go down to level 1 */
        while (nnode->level > 1) {
-                nnode = ubifs_get_nnode(c, nnode, 0);
+                for (iip = 0; iip < UBIFS_LPT_FANOUT; iip++) {
+                        if (nnode->nbranch[iip].lnum)
+                                break;
+                }
+                if (iip >= UBIFS_LPT_FANOUT) {
+                        /*
+                         * Should not happen, but we need to keep going
+                         * if it does.
+                         */
+                        iip = 0;
+                }
+                nnode = ubifs_get_nnode(c, nnode, iip);
                if (IS_ERR(nnode))
                        return (void *)nnode;
        }
-        return ubifs_get_pnode(c, nnode, 0);
+        for (iip = 0; iip < UBIFS_LPT_FANOUT; iip++)
+                if (nnode->nbranch[iip].lnum)
+                        break;
+        if (iip >= UBIFS_LPT_FANOUT)
+                /* Should not happen, but we need to keep going if it does */
+                iip = 0;
+        return ubifs_get_pnode(c, nnode, iip);
 }
 /**
@@ -684,7 +708,7 @@ static int make_tree_dirty(struct ubifs_info *c)
        pnode = pnode_lookup(c, 0);
        while (pnode) {
                do_make_pnode_dirty(c, pnode);
-                pnode = next_pnode(c, pnode);
+                pnode = next_pnode_to_dirty(c, pnode);
                if (IS_ERR(pnode))
                        return PTR_ERR(pnode);
        }
@@ -749,7 +773,7 @@ static void lpt_tgc_start(struct ubifs_info *c)
 * LPT trivial garbage collection is where a LPT LEB contains only dirty and
 * free space and so may be reused as soon as the next commit is completed.
 * This function is called after the commit is completed (master node has been
- * written) and unmaps LPT LEBs that were marked for trivial GC.
+ * written) and un-maps LPT LEBs that were marked for trivial GC.
 */
 static int lpt_tgc_end(struct ubifs_info *c)
 {
@@ -1025,7 +1049,7 @@ static int make_node_dirty(struct ubifs_info *c, int node_type, int node_num,
 * @c: UBIFS file-system description object
 * @node_type: LPT node type
 */
-static int get_lpt_node_len(struct ubifs_info *c, int node_type)
+static int get_lpt_node_len(const struct ubifs_info *c, int node_type)
 {
        switch (node_type) {
        case UBIFS_LPT_NNODE:
@@ -1046,7 +1070,7 @@ static int get_lpt_node_len(struct ubifs_info *c, int node_type)
 * @buf: buffer
 * @len: length of buffer
 */
-static int get_pad_len(struct ubifs_info *c, uint8_t *buf, int len)
+static int get_pad_len(const struct ubifs_info *c, uint8_t *buf, int len)
 {
        int offs, pad_len;
@@ -1063,7 +1087,8 @@ static int get_pad_len(struct ubifs_info *c, uint8_t *buf, int len)
 * @buf: buffer
 * @node_num: node number is returned here
 */
-static int get_lpt_node_type(struct ubifs_info *c, uint8_t *buf, int *node_num)
+static int get_lpt_node_type(const struct ubifs_info *c, uint8_t *buf,
+                             int *node_num)
 {
        uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
        int pos = 0, node_type;
@@ -1081,7 +1106,7 @@ static int get_lpt_node_type(struct ubifs_info *c, uint8_t *buf, int *node_num)
 *
 * This function returns %1 if the buffer contains a node or %0 if it does not.
 */
-static int is_a_node(struct ubifs_info *c, uint8_t *buf, int len)
+static int is_a_node(const struct ubifs_info *c, uint8_t *buf, int len)
 {
        uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
        int pos = 0, node_type, node_len;
@@ -1105,7 +1130,6 @@ static int is_a_node(struct ubifs_info *c, uint8_t *buf, int len)
        return 1;
 }
 /**
 * lpt_gc_lnum - garbage collect a LPT LEB.
 * @c: UBIFS file-system description object
@@ -1463,7 +1487,7 @@ void ubifs_lpt_free(struct ubifs_info *c, int wr_only)
 #ifdef CONFIG_UBIFS_FS_DEBUG
 /**
- * dbg_is_all_ff - determine if a buffer contains only 0xff bytes.
+ * dbg_is_all_ff - determine if a buffer contains only 0xFF bytes.
 * @buf: buffer
 * @len: buffer length
 */
@@ -1488,7 +1512,7 @@ static int dbg_is_nnode_dirty(struct ubifs_info *c, int lnum, int offs)
        struct ubifs_nnode *nnode;
        int hght;
-        /* Entire tree is in memory so first_nnode / next_nnode are ok */
+        /* Entire tree is in memory so first_nnode / next_nnode are OK */
        nnode = first_nnode(c, &hght);
        for (; nnode; nnode = next_nnode(c, nnode, &hght)) {
                struct ubifs_nbranch *branch;
@@ -1602,7 +1626,10 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum)
 {
        int err, len = c->leb_size, dirty = 0, node_type, node_num, node_len;
        int ret;
-        void *buf = c->dbg_buf;
+        void *buf = c->dbg->buf;
+        if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
+                return 0;
        dbg_lp("LEB %d", lnum);
        err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size);
@@ -1704,6 +1731,9 @@ int dbg_chk_lpt_free_spc(struct ubifs_info *c)
        long long free = 0;
        int i;
+        if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
+                return 0;
        for (i = 0; i < c->lpt_lebs; i++) {
                if (c->ltab[i].tgc || c->ltab[i].cmt)
                        continue;
@@ -1716,6 +1746,8 @@ int dbg_chk_lpt_free_spc(struct ubifs_info *c)
                dbg_err("LPT space error: free %lld lpt_sz %lld",
                        free, c->lpt_sz);
                dbg_dump_lpt_info(c);
+                dbg_dump_lpt_lebs(c);
+                dump_stack();
                return -EINVAL;
        }
        return 0;
@@ -1731,15 +1763,19 @@ int dbg_chk_lpt_free_spc(struct ubifs_info *c)
 */
 int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
 {
+        struct ubifs_debug_info *d = c->dbg;
        long long chk_lpt_sz, lpt_sz;
        int err = 0;
+        if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
+                return 0;
        switch (action) {
        case 0:
-                c->chk_lpt_sz = 0;
+                d->chk_lpt_sz = 0;
-                c->chk_lpt_sz2 = 0;
+                d->chk_lpt_sz2 = 0;
-                c->chk_lpt_lebs = 0;
+                d->chk_lpt_lebs = 0;
-                c->chk_lpt_wastage = 0;
+                d->chk_lpt_wastage = 0;
                if (c->dirty_pn_cnt > c->pnode_cnt) {
                        dbg_err("dirty pnodes %d exceed max %d",
                                c->dirty_pn_cnt, c->pnode_cnt);
@@ -1752,35 +1788,35 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
                }
                return err;
        case 1:
-                c->chk_lpt_sz += len;
+                d->chk_lpt_sz += len;
                return 0;
        case 2:
-                c->chk_lpt_sz += len;
+                d->chk_lpt_sz += len;
-                c->chk_lpt_wastage += len;
+                d->chk_lpt_wastage += len;
-                c->chk_lpt_lebs += 1;
+                d->chk_lpt_lebs += 1;
                return 0;
        case 3:
                chk_lpt_sz = c->leb_size;
-                chk_lpt_sz *= c->chk_lpt_lebs;
+                chk_lpt_sz *= d->chk_lpt_lebs;
                chk_lpt_sz += len - c->nhead_offs;
-                if (c->chk_lpt_sz != chk_lpt_sz) {
+                if (d->chk_lpt_sz != chk_lpt_sz) {
                        dbg_err("LPT wrote %lld but space used was %lld",
-                                c->chk_lpt_sz, chk_lpt_sz);
+                                d->chk_lpt_sz, chk_lpt_sz);
                        err = -EINVAL;
                }
-                if (c->chk_lpt_sz > c->lpt_sz) {
+                if (d->chk_lpt_sz > c->lpt_sz) {
                        dbg_err("LPT wrote %lld but lpt_sz is %lld",
-                                c->chk_lpt_sz, c->lpt_sz);
+                                d->chk_lpt_sz, c->lpt_sz);
                        err = -EINVAL;
                }
-                if (c->chk_lpt_sz2 && c->chk_lpt_sz != c->chk_lpt_sz2) {
+                if (d->chk_lpt_sz2 && d->chk_lpt_sz != d->chk_lpt_sz2) {
                        dbg_err("LPT layout size %lld but wrote %lld",
-                                c->chk_lpt_sz, c->chk_lpt_sz2);
+                                d->chk_lpt_sz, d->chk_lpt_sz2);
                        err = -EINVAL;
                }
-                if (c->chk_lpt_sz2 && c->new_nhead_offs != len) {
+                if (d->chk_lpt_sz2 && d->new_nhead_offs != len) {
                        dbg_err("LPT new nhead offs: expected %d was %d",
-                                c->new_nhead_offs, len);
+                                d->new_nhead_offs, len);
                        err = -EINVAL;
                }
                lpt_sz = (long long)c->pnode_cnt * c->pnode_sz;
@@ -1788,26 +1824,146 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
                lpt_sz += c->ltab_sz;
                if (c->big_lpt)
                        lpt_sz += c->lsave_sz;
-                if (c->chk_lpt_sz - c->chk_lpt_wastage > lpt_sz) {
+                if (d->chk_lpt_sz - d->chk_lpt_wastage > lpt_sz) {
                        dbg_err("LPT chk_lpt_sz %lld + waste %lld exceeds %lld",
-                                c->chk_lpt_sz, c->chk_lpt_wastage, lpt_sz);
+                                d->chk_lpt_sz, d->chk_lpt_wastage, lpt_sz);
                        err = -EINVAL;
                }
-                if (err)
+                if (err) {
                        dbg_dump_lpt_info(c);
-                c->chk_lpt_sz2 = c->chk_lpt_sz;
+                        dbg_dump_lpt_lebs(c);
-                c->chk_lpt_sz = 0;
+                        dump_stack();
-                c->chk_lpt_wastage = 0;
+                }
-                c->chk_lpt_lebs = 0;
+                d->chk_lpt_sz2 = d->chk_lpt_sz;
-                c->new_nhead_offs = len;
+                d->chk_lpt_sz = 0;
+                d->chk_lpt_wastage = 0;
+                d->chk_lpt_lebs = 0;
+                d->new_nhead_offs = len;
                return err;
        case 4:
-                c->chk_lpt_sz += len;
+                d->chk_lpt_sz += len;
-                c->chk_lpt_wastage += len;
+                d->chk_lpt_wastage += len;
                return 0;
        default:
                return -EINVAL;
        }
 }
+/**
+ * dump_lpt_leb - dump an LPT LEB.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number to dump
+ *
+ * This function dumps an LEB from LPT area. Nodes in this area are very
+ * different to nodes in the main area (e.g., they do not have common headers,
+ * they do not have 8-byte alignments, etc), so we have a separate function to
+ * dump LPT area LEBs. Note, LPT has to be locked by the caller.
+ *
+ * The whole LEB is read into @c->dbg->buf and then walked from offset zero.
+ * Padding is skipped, and the walk stops at the first position which is
+ * neither a node nor padding (the rest is reported as free space if non-empty)
+ * or at a node of unrecognized type. If the LEB cannot be read, an error
+ * message is printed and nothing is dumped.
+ *
+ * NOTE(review): the value returned by 'ubifs_unpack_nnode()' is stored in
+ * 'err' but is not checked before the nnode branches are printed.
+ */
+static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
+{
+        int err, len = c->leb_size, node_type, node_num, node_len, offs;
+        void *buf = c->dbg->buf;
+        printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n",
+               current->pid, lnum);
+        err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size);
+        if (err) {
+                ubifs_err("cannot read LEB %d, error %d", lnum, err);
+                return;
+        }
+        while (1) {
+                offs = c->leb_size - len;
+                if (!is_a_node(c, buf, len)) {
+                        int pad_len;
+                        pad_len = get_pad_len(c, buf, len);
+                        if (pad_len) {
+                                printk(KERN_DEBUG "LEB %d:%d, pad %d bytes\n",
+                                       lnum, offs, pad_len);
+                                buf += pad_len;
+                                len -= pad_len;
+                                continue;
+                        }
+                        if (len)
+                                printk(KERN_DEBUG "LEB %d:%d, free %d bytes\n",
+                                       lnum, offs, len);
+                        break;
+                }
+                node_type = get_lpt_node_type(c, buf, &node_num);
+                switch (node_type) {
+                case UBIFS_LPT_PNODE:
+                {
+                        node_len = c->pnode_sz;
+                        if (c->big_lpt)
+                                printk(KERN_DEBUG "LEB %d:%d, pnode num %d\n",
+                                       lnum, offs, node_num);
+                        else
+                                printk(KERN_DEBUG "LEB %d:%d, pnode\n",
+                                       lnum, offs);
+                        break;
+                }
+                case UBIFS_LPT_NNODE:
+                {
+                        int i;
+                        struct ubifs_nnode nnode;
+                        node_len = c->nnode_sz;
+                        if (c->big_lpt)
+                                printk(KERN_DEBUG "LEB %d:%d, nnode num %d, ",
+                                       lnum, offs, node_num);
+                        else
+                                printk(KERN_DEBUG "LEB %d:%d, nnode, ",
+                                       lnum, offs);
+                        err = ubifs_unpack_nnode(c, buf, &nnode);
+                        for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
+                                printk("%d:%d", nnode.nbranch[i].lnum,
+                                       nnode.nbranch[i].offs);
+                                if (i != UBIFS_LPT_FANOUT - 1)
+                                        printk(", ");
+                        }
+                        printk("\n");
+                        break;
+                }
+                case UBIFS_LPT_LTAB:
+                        node_len = c->ltab_sz;
+                        printk(KERN_DEBUG "LEB %d:%d, ltab\n",
+                               lnum, offs);
+                        break;
+                case UBIFS_LPT_LSAVE:
+                        node_len = c->lsave_sz;
+                        printk(KERN_DEBUG "LEB %d:%d, lsave len\n", lnum, offs);
+                        break;
+                default:
+                        ubifs_err("LPT node type %d not recognized", node_type);
+                        return;
+                }
+                buf += node_len;
+                len -= node_len;
+        }
+        printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n",
+               current->pid, lnum);
+}
+/**
+ * dbg_dump_lpt_lebs - dump LPT LEBs.
+ * @c: UBIFS file-system description object
+ *
+ * This function dumps all LPT LEBs, i.e. @c->lpt_lebs consecutive LEBs
+ * starting at LEB number @c->lpt_first. The caller has to make sure the LPT is
+ * locked.
+ */
+void dbg_dump_lpt_lebs(const struct ubifs_info *c)
+{
+        int i;
+        printk(KERN_DEBUG "(pid %d) start dumping all LPT LEBs\n",
+               current->pid);
+        for (i = 0; i < c->lpt_lebs; i++)
+                dump_lpt_leb(c, i + c->lpt_first);
+        printk(KERN_DEBUG "(pid %d) finish dumping all LPT LEBs\n",
+               current->pid);
+}
 #endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c
index 71d5493bf565..a88f33801b98 100644
--- a/fs/ubifs/master.c
+++ b/fs/ubifs/master.c
@@ -354,7 +354,7 @@ int ubifs_write_master(struct ubifs_info *c)
        int err, lnum, offs, len;
        if (c->ro_media)
-                return -EINVAL;
+                return -EROFS;
        lnum = UBIFS_MST_LNUM;
        offs = c->mst_offs + c->mst_node_alsz;
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index 9bd5a43d4526..152a7b34a141 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -46,7 +46,7 @@
 * Orphans are accumulated in a rb-tree. When an inode's link count drops to
 * zero, the inode number is added to the rb-tree. It is removed from the tree
 * when the inode is deleted.  Any new orphans that are in the orphan tree when
- * the commit is run, are written to the orphan area in 1 or more orph nodes.
+ * the commit is run, are written to the orphan area in 1 or more orphan nodes.
 * If the orphan area is full, it is consolidated to make space.  There is
 * always enough space because validation prevents the user from creating more
 * than the maximum number of orphans allowed.
@@ -231,7 +231,7 @@ static int tot_avail_orphs(struct ubifs_info *c)
 }
 /**
- * do_write_orph_node - write a node
+ * do_write_orph_node - write a node to the orphan head.
 * @c: UBIFS file-system description object
 * @len: length of node
 * @atomic: write atomically
@@ -264,11 +264,11 @@ static int do_write_orph_node(struct ubifs_info *c, int len, int atomic)
 }
 /**
- * write_orph_node - write an orph node
+ * write_orph_node - write an orphan node.
 * @c: UBIFS file-system description object
 * @atomic: write atomically
 *
- * This function builds an orph node from the cnext list and writes it to the
+ * This function builds an orphan node from the cnext list and writes it to the
 * orphan head. On success, %0 is returned, otherwise a negative error code
 * is returned.
 */
@@ -326,11 +326,11 @@ static int write_orph_node(struct ubifs_info *c, int atomic)
 }
 /**
- * write_orph_nodes - write orph nodes until there are no more to commit
+ * write_orph_nodes - write orphan nodes until there are no more to commit.
 * @c: UBIFS file-system description object
 * @atomic: write atomically
 *
- * This function writes orph nodes for all the orphans to commit. On success,
+ * This function writes orphan nodes for all the orphans to commit. On success,
 * %0 is returned, otherwise a negative error code is returned.
 */
 static int write_orph_nodes(struct ubifs_info *c, int atomic)
@@ -478,14 +478,14 @@ int ubifs_orphan_end_commit(struct ubifs_info *c)
 }
 /**
- * clear_orphans - erase all LEBs used for orphans.
+ * ubifs_clear_orphans - erase all LEBs used for orphans.
 * @c: UBIFS file-system description object
 *
 * If recovery is not required, then the orphans from the previous session
 * are not needed. This function locates the LEBs used to record
 * orphans, and un-maps them.
 */
-static int clear_orphans(struct ubifs_info *c)
+int ubifs_clear_orphans(struct ubifs_info *c)
 {
        int lnum, err;
@@ -547,9 +547,9 @@ static int insert_dead_orphan(struct ubifs_info *c, ino_t inum)
 * do_kill_orphans - remove orphan inodes from the index.
 * @c: UBIFS file-system description object
 * @sleb: scanned LEB
- * @last_cmt_no: cmt_no of last orph node read is passed and returned here
+ * @last_cmt_no: cmt_no of last orphan node read is passed and returned here
 * @outofdate: whether the LEB is out of date is returned here
- * @last_flagged: whether the end orph node is encountered
+ * @last_flagged: whether the end orphan node is encountered
 *
 * This function is a helper to the 'kill_orphans()' function. It goes through
 * every orphan node in a LEB and for every inode number recorded, removes
@@ -580,8 +580,8 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
                /*
                 * The commit number on the master node may be less, because
                 * of a failed commit. If there are several failed commits in a
-                 * row, the commit number written on orph nodes will continue to
+                 * row, the commit number written on orphan nodes will continue
-                 * increase (because the commit number is adjusted here) even
+                 * to increase (because the commit number is adjusted here) even
                 * though the commit number on the master node stays the same
                 * because the master node has not been re-written.
                 */
@@ -589,9 +589,9 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
                        c->cmt_no = cmt_no;
                if (cmt_no < *last_cmt_no && *last_flagged) {
                        /*
-                         * The last orph node had a higher commit number and was
+                         * The last orphan node had a higher commit number and
-                         * flagged as the last written for that commit number.
+                         * was flagged as the last written for that commit
-                         * That makes this orph node, out of date.
+                         * number. That makes this orphan node, out of date.
                         */
                        if (!first) {
                                ubifs_err("out of order commit number %llu in "
@@ -658,10 +658,10 @@ static int kill_orphans(struct ubifs_info *c)
        /*
         * Orph nodes always start at c->orph_first and are written to each
         * successive LEB in turn. Generally unused LEBs will have been unmapped
-         * but may contain out of date orph nodes if the unmap didn't go
+         * but may contain out of date orphan nodes if the unmap didn't go
-         * through. In addition, the last orph node written for each commit is
+         * through. In addition, the last orphan node written for each commit is
         * marked (top bit of orph->cmt_no is set to 1). It is possible that
-         * there are orph nodes from the next commit (i.e. the commit did not
+         * there are orphan nodes from the next commit (i.e. the commit did not
         * complete successfully). In that case, no orphans will have been lost
         * due to the way that orphans are written, and any orphans added will
         * be valid orphans anyway and so can be deleted.
@@ -718,7 +718,7 @@ int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only)
        if (unclean)
                err = kill_orphans(c);
        else if (!read_only)
-                err = clear_orphans(c);
+                err = ubifs_clear_orphans(c);
        return err;
 }
@@ -899,7 +899,7 @@ static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci)
        for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) {
                struct ubifs_scan_leb *sleb;
-                sleb = ubifs_scan(c, lnum, 0, c->dbg_buf);
+                sleb = ubifs_scan(c, lnum, 0, c->dbg->buf);
                if (IS_ERR(sleb)) {
                        err = PTR_ERR(sleb);
                        break;
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 21f7d047c306..ce42a7b0ca5a 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -144,7 +144,7 @@ static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r)
                /*
                 * If the replay order was perfect the dirty space would now be
                 * zero. The order is not perfect because the the journal heads
-                 * race with eachother. This is not a problem but is does mean
+                 * race with each other. This is not a problem but it does mean
                 * that the dirty space may temporarily exceed c->leb_size
                 * during the replay.
                 */
@@ -656,7 +656,7 @@ out_dump:
 * @dirty: amount of dirty space from padding and deletion nodes
 *
 * This function inserts a reference node to the replay tree and returns zero
- * in case of success ort a negative error code in case of failure.
+ * in case of success or a negative error code in case of failure.
 */
 static int insert_ref_node(struct ubifs_info *c, int lnum, int offs,
                           unsigned long long sqnum, int free, int dirty)
@@ -883,7 +883,7 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
                 * This means that we reached end of log and now
                 * look to the older log data, which was already
                 * committed but the eraseblock was not erased (UBIFS
-                 * only unmaps it). So this basically means we have to
+                 * only un-maps it). So this basically means we have to
                 * exit with "end of log" code.
                 */
                err = 1;
@@ -1062,6 +1062,15 @@ int ubifs_replay_journal(struct ubifs_info *c)
        if (err)
                goto out;
+        /*
+         * UBIFS budgeting calculations use @c->budg_uncommitted_idx variable
+         * to roughly estimate index growth. Things like @c->min_idx_lebs
+         * depend on it. This means we have to initialize it to make sure
+         * budgeting works properly.
+         */
+        c->budg_uncommitted_idx = atomic_long_read(&c->dirty_zn_cnt);
+        c->budg_uncommitted_idx *= c->max_idx_node_sz;
        ubifs_assert(c->bud_bytes <= c->max_bud_bytes || c->need_recovery);
        dbg_mnt("finished, log head LEB %d:%d, max_sqnum %llu, "
                "highest_inum %lu", c->lhead_lnum, c->lhead_offs, c->max_sqnum,
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 0f392351dc5a..e070c643d1bb 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -28,6 +28,7 @@
 #include "ubifs.h"
 #include <linux/random.h>
+#include <linux/math64.h>
 /*
 * Default journal size in logical eraseblocks as a percent of total
@@ -80,7 +81,7 @@ static int create_default_filesystem(struct ubifs_info *c)
        int err, tmp, jnl_lebs, log_lebs, max_buds, main_lebs, main_first;
        int lpt_lebs, lpt_first, orph_lebs, big_lpt, ino_waste, sup_flags = 0;
        int min_leb_cnt = UBIFS_MIN_LEB_CNT;
-        uint64_t tmp64, main_bytes;
+        long long tmp64, main_bytes;
        __le64 tmp_le64;
        /* Some functions called from here depend on the @c->key_len filed */
@@ -160,7 +161,7 @@ static int create_default_filesystem(struct ubifs_info *c)
        if (!sup)
                return -ENOMEM;
-        tmp64 = (uint64_t)max_buds * c->leb_size;
+        tmp64 = (long long)max_buds * c->leb_size;
        if (big_lpt)
                sup_flags |= UBIFS_FLG_BIGLPT;
@@ -179,14 +180,16 @@ static int create_default_filesystem(struct ubifs_info *c)
        sup->fanout        = cpu_to_le32(DEFAULT_FANOUT);
        sup->lsave_cnt     = cpu_to_le32(c->lsave_cnt);
        sup->fmt_version   = cpu_to_le32(UBIFS_FORMAT_VERSION);
-        sup->default_compr = cpu_to_le16(UBIFS_COMPR_LZO);
        sup->time_gran     = cpu_to_le32(DEFAULT_TIME_GRAN);
+        if (c->mount_opts.override_compr)
+                sup->default_compr = cpu_to_le16(c->mount_opts.compr_type);
+        else
+                sup->default_compr = cpu_to_le16(UBIFS_COMPR_LZO);
        generate_random_uuid(sup->uuid);
-        main_bytes = (uint64_t)main_lebs * c->leb_size;
+        main_bytes = (long long)main_lebs * c->leb_size;
-        tmp64 = main_bytes * DEFAULT_RP_PERCENT;
+        tmp64 = div_u64(main_bytes * DEFAULT_RP_PERCENT, 100);
-        do_div(tmp64, 100);
        if (tmp64 > DEFAULT_MAX_RP_SIZE)
                tmp64 = DEFAULT_MAX_RP_SIZE;
        sup->rp_size = cpu_to_le64(tmp64);
@@ -582,16 +585,15 @@ int ubifs_read_superblock(struct ubifs_info *c)
        c->jhead_cnt     = le32_to_cpu(sup->jhead_cnt) + NONDATA_JHEADS_CNT;
        c->fanout        = le32_to_cpu(sup->fanout);
        c->lsave_cnt     = le32_to_cpu(sup->lsave_cnt);
-        c->default_compr = le16_to_cpu(sup->default_compr);
        c->rp_size       = le64_to_cpu(sup->rp_size);
        c->rp_uid        = le32_to_cpu(sup->rp_uid);
        c->rp_gid        = le32_to_cpu(sup->rp_gid);
        sup_flags        = le32_to_cpu(sup->flags);
+        if (!c->mount_opts.override_compr)
+                c->default_compr = le16_to_cpu(sup->default_compr);
        c->vfs_sb->s_time_gran = le32_to_cpu(sup->time_gran);
        memcpy(&c->uuid, &sup->uuid, 16);
        c->big_lpt = !!(sup_flags & UBIFS_FLG_BIGLPT);
        /* Automatically increase file system size to the maximum size */
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
index f248533841a2..e7bab52a1410 100644
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -151,7 +151,7 @@ static int shrink_tnc(struct ubifs_info *c, int nr, int age, int *contention)
 * @contention: if any contention, this is set to %1
 *
 * This function walks the list of mounted UBIFS file-systems and frees clean
- * znodes which are older then @age, until at least @nr znodes are freed.
+ * znodes which are older than @age, until at least @nr znodes are freed.
 * Returns the number of freed znodes.
 */
 static int shrink_tnc_trees(int nr, int age, int *contention)
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index d80b2aef42b6..1182b66a5491 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -34,6 +34,8 @@
 #include <linux/parser.h>
 #include <linux/seq_file.h>
 #include <linux/mount.h>
+#include <linux/math64.h>
+#include <linux/writeback.h>
 #include "ubifs.h"
 /*
@@ -395,6 +397,7 @@ static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf)
        buf->f_namelen = UBIFS_MAX_NLEN;
        buf->f_fsid.val[0] = le32_to_cpu(uuid[0]) ^ le32_to_cpu(uuid[2]);
        buf->f_fsid.val[1] = le32_to_cpu(uuid[1]) ^ le32_to_cpu(uuid[3]);
+        ubifs_assert(buf->f_bfree <= c->block_cnt);
        return 0;
 }
@@ -417,39 +420,62 @@ static int ubifs_show_options(struct seq_file *s, struct vfsmount *mnt)
        else if (c->mount_opts.chk_data_crc == 1)
                seq_printf(s, ",no_chk_data_crc");
+        if (c->mount_opts.override_compr) {
+                seq_printf(s, ",compr=");
+                seq_printf(s, ubifs_compr_name(c->mount_opts.compr_type));
+        }
        return 0;
 }
 static int ubifs_sync_fs(struct super_block *sb, int wait)
 {
+        int i, err;
        struct ubifs_info *c = sb->s_fs_info;
-        int i, ret = 0, err;
+        struct writeback_control wbc = {
-        long long bud_bytes;
+                .sync_mode   = WB_SYNC_ALL,
+                .range_start = 0,
+                .range_end   = LLONG_MAX,
+                .nr_to_write = LONG_MAX,
+        };
-        if (c->jheads) {
+        /*
-                for (i = 0; i < c->jhead_cnt; i++) {
+         * Zero @wait is just an advisory thing to help the file system shove
-                        err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
+         * lots of data into the queues, and there will be the second
-                        if (err && !ret)
+         * '->sync_fs()' call, with non-zero @wait.
-                                ret = err;
+         */
-                }
+        if (!wait)
+                return 0;
-                /* Commit the journal unless it has too little data */
+        if (sb->s_flags & MS_RDONLY)
-                spin_lock(&c->buds_lock);
+                return 0;
-                bud_bytes = c->bud_bytes;
-                spin_unlock(&c->buds_lock);
+        /*
-                if (bud_bytes > c->leb_size) {
+         * VFS calls '->sync_fs()' before synchronizing all dirty inodes and
-                        err = ubifs_run_commit(c);
+         * pages, so synchronize them first, then commit the journal. Strictly
-                        if (err)
+         * speaking, it is not necessary to commit the journal here,
-                                return err;
+         * synchronizing write-buffers would be enough. But committing makes
-                }
+         * UBIFS free space predictions much more accurate, so we want to let
-        }
+         * the user be able to get more accurate results of 'statfs()' after
+         * they synchronize the file system.
+         */
+        generic_sync_sb_inodes(sb, &wbc);
        /*
-         * We ought to call sync for c->ubi but it does not have one. If it had
+         * Synchronize write buffers, because 'ubifs_run_commit()' does not
-         * it would in turn call mtd->sync, however mtd operations are
+         * do this if it waits for an already running commit.
-         * synchronous anyway, so we don't lose any sleep here.
         */
-        return ret;
+        for (i = 0; i < c->jhead_cnt; i++) {
+                err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
+                if (err)
+                        return err;
+        }
+        err = ubifs_run_commit(c);
+        if (err)
+                return err;
+        return ubi_sync(c->vi.ubi_num);
 }
 /**
@@ -548,15 +574,8 @@ static int init_constants_early(struct ubifs_info *c)
        c->ranges[UBIFS_IDX_NODE].max_len = INT_MAX;
        /*
-         * Initialize dead and dark LEB space watermarks.
+         * Initialize dead and dark LEB space watermarks. See gc.c for comments
-         *
+         * about these values.
-         * Dead space is the space which cannot be used. Its watermark is
-         * equivalent to min. I/O unit or minimum node size if it is greater
-         * then min. I/O unit.
-         *
-         * Dark space is the space which might be used, or might not, depending
-         * on which node should be written to the LEB. Its watermark is
-         * equivalent to maximum UBIFS node size.
         */
        c->dead_wm = ALIGN(MIN_WRITE_SZ, c->min_io_size);
        c->dark_wm = ALIGN(UBIFS_MAX_NODE_SZ, c->min_io_size);
@@ -596,7 +615,7 @@ static int bud_wbuf_callback(struct ubifs_info *c, int lnum, int free, int pad)
 }
 /*
- * init_constants_late - initialize UBIFS constants.
+ * init_constants_sb - initialize UBIFS constants.
 * @c: UBIFS file-system description object
 *
 * This is a helper function which initializes various UBIFS constants after
@@ -604,10 +623,10 @@ static int bud_wbuf_callback(struct ubifs_info *c, int lnum, int free, int pad)
 * makes sure they are all right. Returns zero in case of success and a
 * negative error code in case of failure.
 */
-static int init_constants_late(struct ubifs_info *c)
+static int init_constants_sb(struct ubifs_info *c)
 {
        int tmp, err;
-        uint64_t tmp64;
+        long long tmp64;
        c->main_bytes = (long long)c->main_lebs * c->leb_size;
        c->max_znode_sz = sizeof(struct ubifs_znode) +
@@ -634,9 +653,8 @@ static int init_constants_late(struct ubifs_info *c)
         * Make sure that the log is large enough to fit reference nodes for
         * all buds plus one reserved LEB.
         */
-        tmp64 = c->max_bud_bytes;
+        tmp64 = c->max_bud_bytes + c->leb_size - 1;
-        tmp = do_div(tmp64, c->leb_size);
+        c->max_bud_cnt = div_u64(tmp64, c->leb_size);
-        c->max_bud_cnt = tmp64 + !!tmp;
        tmp = (c->ref_node_alsz * c->max_bud_cnt + c->leb_size - 1);
        tmp /= c->leb_size;
        tmp += 1;
@@ -672,7 +690,7 @@ static int init_constants_late(struct ubifs_info *c)
         * Consequently, if the journal is too small, UBIFS will treat it as
         * always full.
         */
-        tmp64 = (uint64_t)(c->jhead_cnt + 1) * c->leb_size + 1;
+        tmp64 = (long long)(c->jhead_cnt + 1) * c->leb_size + 1;
        if (c->bg_bud_bytes < tmp64)
                c->bg_bud_bytes = tmp64;
        if (c->max_bud_bytes < tmp64 + c->leb_size)
@@ -682,6 +700,21 @@ static int init_constants_late(struct ubifs_info *c)
        if (err)
                return err;
+        return 0;
+}
+/*
+ * init_constants_master - initialize UBIFS constants.
+ * @c: UBIFS file-system description object
+ *
+ * This is a helper function which initializes various UBIFS constants after
+ * the master node has been read. It also checks various UBIFS parameters and
+ * makes sure they are all right.
+ */
+static void init_constants_master(struct ubifs_info *c)
+{
+        long long tmp64;
        c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
        /*
@@ -690,26 +723,25 @@ static int init_constants_late(struct ubifs_info *c)
         * necessary to report something for the 'statfs()' call.
         *
         * Subtract the LEB reserved for GC, the LEB which is reserved for
-         * deletions, and assume only one journal head is available.
+         * deletions, minimum LEBs for the index, and assume only one journal
+         * head is available.
         */
-        tmp64 = c->main_lebs - 2 - c->jhead_cnt + 1;
+        tmp64 = c->main_lebs - 1 - 1 - MIN_INDEX_LEBS - c->jhead_cnt + 1;
-        tmp64 *= (uint64_t)c->leb_size - c->leb_overhead;
+        tmp64 *= (long long)c->leb_size - c->leb_overhead;
        tmp64 = ubifs_reported_space(c, tmp64);
        c->block_cnt = tmp64 >> UBIFS_BLOCK_SHIFT;
-        return 0;
 }
 /**
 * take_gc_lnum - reserve GC LEB.
 * @c: UBIFS file-system description object
 *
- * This function ensures that the LEB reserved for garbage collection is
+ * This function ensures that the LEB reserved for garbage collection is marked
- * unmapped and is marked as "taken" in lprops. We also have to set free space
+ * as "taken" in lprops. We also have to set free space to LEB size and dirty
- * to LEB size and dirty space to zero, because lprops may contain out-of-date
+ * space to zero, because lprops may contain out-of-date information if the
- * information if the file-system was un-mounted before it has been committed.
+ * file-system was un-mounted before it has been committed. This function
- * This function returns zero in case of success and a negative error code in
+ * returns zero in case of success and a negative error code in case of
- * case of failure.
+ * failure.
 */
 static int take_gc_lnum(struct ubifs_info *c)
 {
@@ -720,10 +752,6 @@ static int take_gc_lnum(struct ubifs_info *c)
                return -EINVAL;
        }
-        err = ubifs_leb_unmap(c, c->gc_lnum);
-        if (err)
-                return err;
        /* And we have to tell lprops that this LEB is taken */
        err = ubifs_change_one_lp(c, c->gc_lnum, c->leb_size, 0,
                                  LPROPS_TAKEN, 0, 0);
@@ -878,6 +906,7 @@ static int check_volume_empty(struct ubifs_info *c)
 * Opt_no_bulk_read: disable bulk-reads
 * Opt_chk_data_crc: check CRCs when reading data nodes
 * Opt_no_chk_data_crc: do not check CRCs when reading data nodes
+ * Opt_override_compr: override default compressor
 * Opt_err: just end of array marker
 */
 enum {
@@ -887,6 +916,7 @@ enum {
        Opt_no_bulk_read,
        Opt_chk_data_crc,
        Opt_no_chk_data_crc,
+        Opt_override_compr,
        Opt_err,
 };
@@ -897,6 +927,7 @@ static const match_table_t tokens = {
        {Opt_no_bulk_read, "no_bulk_read"},
        {Opt_chk_data_crc, "chk_data_crc"},
        {Opt_no_chk_data_crc, "no_chk_data_crc"},
+        {Opt_override_compr, "compr=%s"},
        {Opt_err, NULL},
 };
@@ -926,13 +957,16 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options,
                token = match_token(p, tokens, args);
                switch (token) {
+                /*
+                 * %Opt_fast_unmount and %Opt_norm_unmount options are ignored.
+                 * We accept them in order to be backward-compatible. But this
+                 * should be removed at some point.
+                 */
                case Opt_fast_unmount:
                        c->mount_opts.unmount_mode = 2;
-                        c->fast_unmount = 1;
                        break;
                case Opt_norm_unmount:
                        c->mount_opts.unmount_mode = 1;
-                        c->fast_unmount = 0;
                        break;
                case Opt_bulk_read:
                        c->mount_opts.bulk_read = 2;
@@ -950,6 +984,28 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options,
                        c->mount_opts.chk_data_crc = 1;
                        c->no_chk_data_crc = 1;
                        break;
+                case Opt_override_compr:
+                {
+                        char *name = match_strdup(&args[0]);
+                        if (!name)
+                                return -ENOMEM;
+                        if (!strcmp(name, "none"))
+                                c->mount_opts.compr_type = UBIFS_COMPR_NONE;
+                        else if (!strcmp(name, "lzo"))
+                                c->mount_opts.compr_type = UBIFS_COMPR_LZO;
+                        else if (!strcmp(name, "zlib"))
+                                c->mount_opts.compr_type = UBIFS_COMPR_ZLIB;
+                        else {
+                                ubifs_err("unknown compressor \"%s\"", name);
+                                kfree(name);
+                                return -EINVAL;
+                        }
+                        kfree(name);
+                        c->mount_opts.override_compr = 1;
+                        c->default_compr = c->mount_opts.compr_type;
+                        break;
+                }
                default:
                        ubifs_err("unrecognized mount option \"%s\" "
                                  "or missing value", p);
@@ -1019,6 +1075,25 @@ again:
 }
 /**
+ * check_free_space - check if there is enough free space to mount.
+ * @c: UBIFS file-system description object
+ *
+ * This function makes sure UBIFS has enough free space to be mounted in
+ * read/write mode. UBIFS must always have some free space to allow deletions.
+ *
+ * The check compares the sum of @c->lst.total_free and @c->lst.total_dirty
+ * against the dark watermark @c->dark_wm, which is asserted to be positive.
+ * If the sum is smaller, an error is logged, 'dbg_dump_budg()' and
+ * 'dbg_dump_lprops()' are called, and %-ENOSPC is returned. Otherwise %0 is
+ * returned.
+ */
+static int check_free_space(struct ubifs_info *c)
+{
+        ubifs_assert(c->dark_wm > 0);
+        if (c->lst.total_free + c->lst.total_dirty < c->dark_wm) {
+                ubifs_err("insufficient free space to mount in read/write mode");
+                dbg_dump_budg(c);
+                dbg_dump_lprops(c);
+                return -ENOSPC;
+        }
+        return 0;
+}
+/**
 * mount_ubifs - mount UBIFS file-system.
 * @c: UBIFS file-system description object
 *
@@ -1039,11 +1114,9 @@ static int mount_ubifs(struct ubifs_info *c)
        if (err)
                return err;
-#ifdef CONFIG_UBIFS_FS_DEBUG
+        err = ubifs_debugging_init(c);
-        c->dbg_buf = vmalloc(c->leb_size);
+        if (err)
-        if (!c->dbg_buf)
+                return err;
-                return -ENOMEM;
-#endif
        err = check_volume_empty(c);
        if (err)
@@ -1100,27 +1173,25 @@ static int mount_ubifs(struct ubifs_info *c)
                goto out_free;
        /*
-         * Make sure the compressor which is set as the default on in the
+         * Make sure the compressor which is set as default in the superblock
-         * superblock was actually compiled in.
+         * or overridden by mount options is actually compiled in.
         */
        if (!ubifs_compr_present(c->default_compr)) {
-                ubifs_warn("'%s' compressor is set by superblock, but not "
+                ubifs_err("'compressor \"%s\" is not compiled in",
-                           "compiled in", ubifs_compr_name(c->default_compr));
+                          ubifs_compr_name(c->default_compr));
-                c->default_compr = UBIFS_COMPR_NONE;
+                goto out_free;
        }
-        dbg_failure_mode_registration(c);
+        err = init_constants_sb(c);
-        err = init_constants_late(c);
        if (err)
-                goto out_dereg;
+                goto out_free;
        sz = ALIGN(c->max_idx_node_sz, c->min_io_size);
        sz = ALIGN(sz + c->max_idx_node_sz, c->min_io_size);
        c->cbuf = kmalloc(sz, GFP_NOFS);
        if (!c->cbuf) {
                err = -ENOMEM;
-                goto out_dereg;
+                goto out_free;
        }
        sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id);
@@ -1145,6 +1216,8 @@ static int mount_ubifs(struct ubifs_info *c)
        if (err)
                goto out_master;
+        init_constants_master(c);
        if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) {
                ubifs_msg("recovery needed");
                c->need_recovery = 1;
@@ -1183,12 +1256,9 @@ static int mount_ubifs(struct ubifs_info *c)
        if (!mounted_read_only) {
                int lnum;
-                /* Check for enough free space */
+                err = check_free_space(c);
-                if (ubifs_calc_available(c, c->min_idx_lebs) <= 0) {
+                if (err)
-                        ubifs_err("insufficient available space");
-                        err = -EINVAL;
                        goto out_orphans;
-                }
                /* Check for enough log space */
                lnum = c->lhead_lnum + 1;
@@ -1205,10 +1275,19 @@ static int mount_ubifs(struct ubifs_info *c)
                        if (err)
                                goto out_orphans;
                        err = ubifs_rcvry_gc_commit(c);
-                } else
+                } else {
                        err = take_gc_lnum(c);
-                if (err)
+                        if (err)
-                        goto out_orphans;
+                                goto out_orphans;
+                        /*
+                         * GC LEB may contain garbage if there was an unclean
+                         * reboot, and it should be un-mapped.
+                         */
+                        err = ubifs_leb_unmap(c, c->gc_lnum);
+                        if (err)
+                                return err;
+                }
                err = dbg_check_lprops(c);
                if (err)
@@ -1217,6 +1296,16 @@ static int mount_ubifs(struct ubifs_info *c)
                err = ubifs_recover_size(c);
                if (err)
                        goto out_orphans;
+        } else {
+                /*
+                 * Even if we mount read-only, we have to set space in GC LEB
+                 * to proper value because this affects UBIFS free space
+                 * reporting. We do not want to have a situation when
+                 * re-mounting from R/O to R/W changes amount of free space.
+                 */
+                err = take_gc_lnum(c);
+                if (err)
+                        goto out_orphans;
        }
        spin_lock(&ubifs_infos_lock);
@@ -1229,13 +1318,20 @@ static int mount_ubifs(struct ubifs_info *c)
                else {
                        c->need_recovery = 0;
                        ubifs_msg("recovery completed");
+                        /* GC LEB has to be empty and taken at this point */
+                        ubifs_assert(c->lst.taken_empty_lebs == 1);
                }
-        }
+        } else
+                ubifs_assert(c->lst.taken_empty_lebs == 1);
        err = dbg_check_filesystem(c);
        if (err)
                goto out_infos;
+        err = dbg_debugfs_init_fs(c);
+        if (err)
+                goto out_infos;
        c->always_chk_crc = 0;
        ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"",
@@ -1266,7 +1362,6 @@ static int mount_ubifs(struct ubifs_info *c)
               c->uuid[4], c->uuid[5], c->uuid[6], c->uuid[7],
               c->uuid[8], c->uuid[9], c->uuid[10], c->uuid[11],
               c->uuid[12], c->uuid[13], c->uuid[14], c->uuid[15]);
-        dbg_msg("fast unmount:        %d", c->fast_unmount);
        dbg_msg("big_lpt              %d", c->big_lpt);
        dbg_msg("log LEBs:            %d (%d - %d)",
                c->log_lebs, UBIFS_LOG_LNUM, c->log_last);
@@ -1283,8 +1378,20 @@ static int mount_ubifs(struct ubifs_info *c)
        dbg_msg("tree fanout:         %d", c->fanout);
        dbg_msg("reserved GC LEB:     %d", c->gc_lnum);
        dbg_msg("first main LEB:      %d", c->main_first);
+        dbg_msg("max. znode size      %d", c->max_znode_sz);
+        dbg_msg("max. index node size %d", c->max_idx_node_sz);
+        dbg_msg("node sizes:          data %zu, inode %zu, dentry %zu",
+                UBIFS_DATA_NODE_SZ, UBIFS_INO_NODE_SZ, UBIFS_DENT_NODE_SZ);
+        dbg_msg("node sizes:          trun %zu, sb %zu, master %zu",
+                UBIFS_TRUN_NODE_SZ, UBIFS_SB_NODE_SZ, UBIFS_MST_NODE_SZ);
+        dbg_msg("node sizes:          ref %zu, cmt. start %zu, orph %zu",
+                UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ);
+        dbg_msg("max. node sizes:     data %zu, inode %zu dentry %zu",
+                UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ,
+                UBIFS_MAX_DENT_NODE_SZ);
        dbg_msg("dead watermark:      %d", c->dead_wm);
        dbg_msg("dark watermark:      %d", c->dark_wm);
+        dbg_msg("LEB overhead:        %d", c->leb_overhead);
        x = (long long)c->main_lebs * c->dark_wm;
        dbg_msg("max. dark space:     %lld (%lld KiB, %lld MiB)",
                x, x >> 10, x >> 20);
@@ -1320,14 +1427,12 @@ out_wbufs:
        free_wbufs(c);
 out_cbuf:
        kfree(c->cbuf);
-out_dereg:
-        dbg_failure_mode_deregistration(c);
 out_free:
        kfree(c->bu.buf);
        vfree(c->ileb_buf);
        vfree(c->sbuf);
        kfree(c->bottom_up_buf);
-        UBIFS_DBG(vfree(c->dbg_buf));
+        ubifs_debugging_exit(c);
        return err;
 }
@@ -1345,6 +1450,7 @@ static void ubifs_umount(struct ubifs_info *c)
        dbg_gen("un-mounting UBI device %d, volume %d", c->vi.ubi_num,
                c->vi.vol_id);
+        dbg_debugfs_exit_fs(c);
        spin_lock(&ubifs_infos_lock);
        list_del(&c->infos_list);
        spin_unlock(&ubifs_infos_lock);
@@ -1364,8 +1470,7 @@ static void ubifs_umount(struct ubifs_info *c)
        vfree(c->ileb_buf);
        vfree(c->sbuf);
        kfree(c->bottom_up_buf);
-        UBIFS_DBG(vfree(c->dbg_buf));
+        ubifs_debugging_exit(c);
-        dbg_failure_mode_deregistration(c);
 }
 /**
@@ -1380,19 +1485,14 @@ static int ubifs_remount_rw(struct ubifs_info *c)
 {
        int err, lnum;
-        if (c->ro_media)
-                return -EINVAL;
        mutex_lock(&c->umount_mutex);
+        dbg_save_space_info(c);
        c->remounting_rw = 1;
        c->always_chk_crc = 1;
-        /* Check for enough free space */
+        err = check_free_space(c);
-        if (ubifs_calc_available(c, c->min_idx_lebs) <= 0) {
+        if (err)
-                ubifs_err("insufficient available space");
-                err = -EINVAL;
                goto out;
-        }
        if (c->old_leb_cnt != c->leb_cnt) {
                struct ubifs_sb_node *sup;
@@ -1422,6 +1522,12 @@ static int ubifs_remount_rw(struct ubifs_info *c)
                err = ubifs_recover_inl_heads(c, c->sbuf);
                if (err)
                        goto out;
+        } else {
+                /* A readonly mount is not allowed to have orphans */
+                ubifs_assert(c->tot_orphans == 0);
+                err = ubifs_clear_orphans(c);
+                if (err)
+                        goto out;
        }
        if (!(c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY))) {
@@ -1477,7 +1583,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
        if (c->need_recovery)
                err = ubifs_rcvry_gc_commit(c);
        else
-                err = take_gc_lnum(c);
+                err = ubifs_leb_unmap(c, c->gc_lnum);
        if (err)
                goto out;
@@ -1490,8 +1596,9 @@ static int ubifs_remount_rw(struct ubifs_info *c)
        c->vfs_sb->s_flags &= ~MS_RDONLY;
        c->remounting_rw = 0;
        c->always_chk_crc = 0;
+        err = dbg_check_space_info(c);
        mutex_unlock(&c->umount_mutex);
-        return 0;
+        return err;
 out:
        vfree(c->orph_buf);
@@ -1511,39 +1618,18 @@ out:
 }
 /**
- * commit_on_unmount - commit the journal when un-mounting.
- * @c: UBIFS file-system description object
- *
- * This function is called during un-mounting and re-mounting, and it commits
- * the journal unless the "fast unmount" mode is enabled. It also avoids
- * committing the journal if it contains too few data.
- */
-static void commit_on_unmount(struct ubifs_info *c)
-{
-        if (!c->fast_unmount) {
-                long long bud_bytes;
-                spin_lock(&c->buds_lock);
-                bud_bytes = c->bud_bytes;
-                spin_unlock(&c->buds_lock);
-                if (bud_bytes > c->leb_size)
-                        ubifs_run_commit(c);
-        }
-}
-/**
 * ubifs_remount_ro - re-mount in read-only mode.
 * @c: UBIFS file-system description object
 *
- * We rely on VFS to have stopped writing. Possibly the background thread could
+ * We assume VFS has stopped writing. Possibly the background thread could be
- * be running a commit, however kthread_stop will wait in that case.
+ * running a commit, however kthread_stop will wait in that case.
 */
 static void ubifs_remount_ro(struct ubifs_info *c)
 {
        int i, err;
        ubifs_assert(!c->need_recovery);
-        commit_on_unmount(c);
+        ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY));
        mutex_lock(&c->umount_mutex);
        if (c->bgt) {
@@ -1551,27 +1637,29 @@ static void ubifs_remount_ro(struct ubifs_info *c)
                c->bgt = NULL;
        }
+        dbg_save_space_info(c);
        for (i = 0; i < c->jhead_cnt; i++) {
                ubifs_wbuf_sync(&c->jheads[i].wbuf);
                del_timer_sync(&c->jheads[i].wbuf.timer);
        }
-        if (!c->ro_media) {
+        c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY);
-                c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY);
+        c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
-                c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
+        c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum);
-                c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum);
+        err = ubifs_write_master(c);
-                err = ubifs_write_master(c);
+        if (err)
-                if (err)
+                ubifs_ro_mode(c, err);
-                        ubifs_ro_mode(c, err);
-        }
-        ubifs_destroy_idx_gc(c);
        free_wbufs(c);
        vfree(c->orph_buf);
        c->orph_buf = NULL;
        vfree(c->ileb_buf);
        c->ileb_buf = NULL;
        ubifs_lpt_free(c, 1);
+        err = dbg_check_space_info(c);
+        if (err)
+                ubifs_ro_mode(c, err);
        mutex_unlock(&c->umount_mutex);
 }
@@ -1664,11 +1752,20 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
        }
        if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
+                if (c->ro_media) {
+                        ubifs_msg("cannot re-mount due to prior errors");
+                        return -EROFS;
+                }
                err = ubifs_remount_rw(c);
                if (err)
                        return err;
-        } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY))
+        } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) {
+                if (c->ro_media) {
+                        ubifs_msg("cannot re-mount due to prior errors");
+                        return -EROFS;
+                }
                ubifs_remount_ro(c);
+        }
        if (c->bulk_read == 1)
                bu_init(c);
@@ -1678,10 +1775,11 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
                c->bu.buf = NULL;
        }
+        ubifs_assert(c->lst.taken_empty_lebs == 1);
        return 0;
 }
-struct super_operations ubifs_super_operations = {
+const struct super_operations ubifs_super_operations = {
        .alloc_inode   = ubifs_alloc_inode,
        .destroy_inode = ubifs_destroy_inode,
        .put_super     = ubifs_put_super,
@@ -1849,7 +1947,6 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
                goto out_iput;
        mutex_unlock(&c->umount_mutex);
        return 0;
 out_iput:
@@ -1949,15 +2046,6 @@ out_close:
 static void ubifs_kill_sb(struct super_block *sb)
 {
-        struct ubifs_info *c = sb->s_fs_info;
-        /*
-         * We do 'commit_on_unmount()' here instead of 'ubifs_put_super()'
-         * in order to be outside BKL.
-         */
-        if (sb->s_root && !(sb->s_flags & MS_RDONLY))
-                commit_on_unmount(c);
-        /* The un-mount routine is actually done in put_super() */
        generic_shutdown_super(sb);
 }
@@ -2021,6 +2109,14 @@ static int __init ubifs_init(void)
        BUILD_BUG_ON(UBIFS_REF_NODE_SZ != 64);
        /*
+         * We use 2 bit wide bit-fields to store compression type, which should
+         * be amended if more compressors are added. The bit-fields are:
+         * @compr_type in 'struct ubifs_inode', @default_compr in
+         * 'struct ubifs_info' and @compr_type in 'struct ubifs_mount_opts'.
+         */
+        BUILD_BUG_ON(UBIFS_COMPR_TYPES_CNT > 4);
+        /*
         * We require that PAGE_CACHE_SIZE is greater-than-or-equal-to
         * UBIFS_BLOCK_SIZE. It is assumed that both are powers of 2.
         */
@@ -2049,11 +2145,17 @@ static int __init ubifs_init(void)
        err = ubifs_compressors_init();
        if (err)
+                goto out_shrinker;
+        err = dbg_debugfs_init();
+        if (err)
                goto out_compr;
        return 0;
 out_compr:
+        ubifs_compressors_exit();
+out_shrinker:
        unregister_shrinker(&ubifs_shrinker_info);
        kmem_cache_destroy(ubifs_inode_slab);
 out_reg:
@@ -2068,6 +2170,7 @@ static void __exit ubifs_exit(void)
        ubifs_assert(list_empty(&ubifs_infos));
        ubifs_assert(atomic_long_read(&ubifs_clean_zn_cnt) == 0);
+        dbg_debugfs_exit();
        ubifs_compressors_exit();
        unregister_shrinker(&ubifs_shrinker_info);
        kmem_cache_destroy(ubifs_inode_slab);
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 6eef5344a145..fa28a84c6a1b 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -443,6 +443,11 @@ static int tnc_read_node_nm(struct ubifs_info *c, struct ubifs_zbranch *zbr,
 * This function performs that same function as ubifs_read_node except that
 * it does not require that there is actually a node present and instead
 * the return code indicates if a node was read.
+ *
+ * Note, this function does not check CRC of data nodes if @c->no_chk_data_crc
+ * is true (it is controlled by corresponding mount option). However, if
+ * @c->always_chk_crc is true, @c->no_chk_data_crc is ignored and CRC is always
+ * checked.
 */
 static int try_read_node(const struct ubifs_info *c, void *buf, int type,
                         int len, int lnum, int offs)
@@ -470,9 +475,8 @@ static int try_read_node(const struct ubifs_info *c, void *buf, int type,
        if (node_len != len)
                return 0;
-        if (type == UBIFS_DATA_NODE && !c->always_chk_crc)
+        if (type == UBIFS_DATA_NODE && !c->always_chk_crc && c->no_chk_data_crc)
-                if (c->no_chk_data_crc)
+                return 1;
-                        return 0;
        crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8);
        node_crc = le32_to_cpu(ch->crc);
@@ -1506,7 +1510,7 @@ out:
 *
 * Note, if the bulk-read buffer length (@bu->buf_len) is known, this function
 * makes sure bulk-read nodes fit the buffer. Otherwise, this function prepares
- * maxumum possible amount of nodes for bulk-read.
+ * maximum possible amount of nodes for bulk-read.
 */
 int ubifs_tnc_get_bu_keys(struct ubifs_info *c, struct bu_info *bu)
 {
@@ -2245,12 +2249,11 @@ int ubifs_tnc_replace(struct ubifs_info *c, const union ubifs_key *key,
                        if (found) {
                                /* Ensure the znode is dirtied */
                                if (znode->cnext || !ubifs_zn_dirty(znode)) {
-                                            znode = dirty_cow_bottom_up(c,
+                                        znode = dirty_cow_bottom_up(c, znode);
-                                                                        znode);
+                                        if (IS_ERR(znode)) {
-                                            if (IS_ERR(znode)) {
+                                                err = PTR_ERR(znode);
-                                                    err = PTR_ERR(znode);
+                                                goto out_unlock;
-                                                    goto out_unlock;
+                                        }
-                                            }
                                }
                                zbr = &znode->zbranch[n];
                                lnc_free(zbr);
@@ -2317,11 +2320,11 @@ int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key,
                /* Ensure the znode is dirtied */
                if (znode->cnext || !ubifs_zn_dirty(znode)) {
-                            znode = dirty_cow_bottom_up(c, znode);
+                        znode = dirty_cow_bottom_up(c, znode);
-                            if (IS_ERR(znode)) {
+                        if (IS_ERR(znode)) {
-                                    err = PTR_ERR(znode);
+                                err = PTR_ERR(znode);
-                                    goto out_unlock;
+                                goto out_unlock;
-                            }
+                        }
                }
                if (found == 1) {
@@ -2627,11 +2630,11 @@ int ubifs_tnc_remove_range(struct ubifs_info *c, union ubifs_key *from_key,
                /* Ensure the znode is dirtied */
                if (znode->cnext || !ubifs_zn_dirty(znode)) {
-                            znode = dirty_cow_bottom_up(c, znode);
+                        znode = dirty_cow_bottom_up(c, znode);
-                            if (IS_ERR(znode)) {
+                        if (IS_ERR(znode)) {
-                                    err = PTR_ERR(znode);
+                                err = PTR_ERR(znode);
-                                    goto out_unlock;
+                                goto out_unlock;
-                            }
+                        }
                }
                /* Remove all keys in range except the first */
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
index 8ac76b1c2d55..fde8d127c768 100644
--- a/fs/ubifs/tnc_commit.c
+++ b/fs/ubifs/tnc_commit.c
@@ -553,8 +553,8 @@ static int layout_in_empty_space(struct ubifs_info *c)
        }
 #ifdef CONFIG_UBIFS_FS_DEBUG
-        c->new_ihead_lnum = lnum;
+        c->dbg->new_ihead_lnum = lnum;
-        c->new_ihead_offs = buf_offs;
+        c->dbg->new_ihead_offs = buf_offs;
 #endif
        return 0;
@@ -802,8 +802,10 @@ int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot)
         * budgeting subsystem to assume the index is already committed,
         * even though it is not.
         */
+        ubifs_assert(c->min_idx_lebs == ubifs_calc_min_idx_lebs(c));
        c->old_idx_sz = c->calc_idx_sz;
        c->budg_uncommitted_idx = 0;
+        c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
        spin_unlock(&c->space_lock);
        mutex_unlock(&c->tnc_mutex);
@@ -1002,7 +1004,8 @@ static int write_index(struct ubifs_info *c)
        }
 #ifdef CONFIG_UBIFS_FS_DEBUG
-        if (lnum != c->new_ihead_lnum || buf_offs != c->new_ihead_offs) {
+        if (lnum != c->dbg->new_ihead_lnum ||
+            buf_offs != c->dbg->new_ihead_offs) {
                ubifs_err("inconsistent ihead");
                return -EINVAL;
        }
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h
index 0b378042a3a2..b25fc36cf72f 100644
--- a/fs/ubifs/ubifs-media.h
+++ b/fs/ubifs/ubifs-media.h
@@ -51,6 +51,13 @@
 */
 #define UBIFS_MIN_COMPR_LEN 128
+/*
+ * If compressed data length is less than %UBIFS_MIN_COMPRESS_DIFF bytes
+ * shorter than uncompressed data length, UBIFS prefers to leave this data
+ * node uncompressed, because it'll be read faster.
+ */
+#define UBIFS_MIN_COMPRESS_DIFF 64
 /* Root inode number */
 #define UBIFS_ROOT_INO 1
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 46b172560a06..039a68bee29a 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -63,6 +63,14 @@
 #define SQNUM_WARN_WATERMARK 0xFFFFFFFF00000000ULL
 #define SQNUM_WATERMARK      0xFFFFFFFFFF000000ULL
+/*
+ * Minimum amount of LEBs reserved for the index. At present the index needs at
+ * least 2 LEBs: one for the index head and one for in-the-gaps method (which
+ * currently does not cater for the index head and so excludes it from
+ * consideration).
+ */
+#define MIN_INDEX_LEBS 2
 /* Minimum amount of data UBIFS writes to the flash */
 #define MIN_WRITE_SZ (UBIFS_DATA_NODE_SZ + 8)
@@ -386,12 +394,12 @@ struct ubifs_inode {
        unsigned int dirty:1;
        unsigned int xattr:1;
        unsigned int bulk_read:1;
+        unsigned int compr_type:2;
        struct mutex ui_mutex;
        spinlock_t ui_lock;
        loff_t synced_i_size;
        loff_t ui_size;
        int flags;
-        int compr_type;
        pgoff_t last_page_read;
        pgoff_t read_in_a_row;
        int data_len;
@@ -418,9 +426,9 @@ struct ubifs_unclean_leb {
 * LEB properties flags.
 *
 * LPROPS_UNCAT: not categorized
- * LPROPS_DIRTY: dirty > 0, not index
+ * LPROPS_DIRTY: dirty > free, dirty >= @c->dead_wm, not index
+ * LPROPS_DIRTY_IDX: dirty + free > @c->min_idx_node_sz and index
+ * LPROPS_DIRTY_IDX: dirty + free > @c->min_idx_node_sze and index
- * LPROPS_FREE: free > 0, not empty, not index
+ * LPROPS_FREE: free > 0, dirty < @c->dead_wm, not empty, not index
 * LPROPS_HEAP_CNT: number of heaps used for storing categorized LEBs
 * LPROPS_EMPTY: LEB is empty, not taken
 * LPROPS_FREEABLE: free + dirty == leb_size, not index, not taken
@@ -473,8 +481,8 @@ struct ubifs_lprops {
 struct ubifs_lpt_lprops {
        int free;
        int dirty;
-        unsigned tgc : 1;
+        unsigned tgc:1;
-        unsigned cmt : 1;
+        unsigned cmt:1;
 };
 /**
@@ -482,24 +490,26 @@ struct ubifs_lpt_lprops {
 * @empty_lebs: number of empty LEBs
 * @taken_empty_lebs: number of taken LEBs
 * @idx_lebs: number of indexing LEBs
- * @total_free: total free space in bytes
+ * @total_free: total free space in bytes (includes all LEBs)
- * @total_dirty: total dirty space in bytes
+ * @total_dirty: total dirty space in bytes (includes all LEBs)
- * @total_used: total used space in bytes (includes only data LEBs)
+ * @total_used: total used space in bytes (does not include index LEBs)
- * @total_dead: total dead space in bytes (includes only data LEBs)
+ * @total_dead: total dead space in bytes (does not include index LEBs)
- * @total_dark: total dark space in bytes (includes only data LEBs)
+ * @total_dark: total dark space in bytes (does not include index LEBs)
+ *
+ * The @taken_empty_lebs field counts the LEBs that are in the transient state
+ * of having been "taken" for use but not yet written to. @taken_empty_lebs is
+ * needed to account correctly for @gc_lnum, otherwise @empty_lebs could be
+ * used by itself (in which case 'unused_lebs' would be a better name). In the
+ * case of @gc_lnum, it is "taken" at mount time or whenever a LEB is retained
+ * by GC, but unlike other empty LEBs that are "taken", it may not be written
+ * straight away (i.e. before the next commit start or unmount), so either
+ * @gc_lnum must be specially accounted for, or the current approach followed
+ * i.e. count it under @taken_empty_lebs.
 *
- * N.B. total_dirty and total_used are different to other total_* fields,
+ * @empty_lebs includes @taken_empty_lebs.
- * because they account _all_ LEBs, not just data LEBs.
 *
- * 'taken_empty_lebs' counts the LEBs that are in the transient state of having
+ * @total_used, @total_dead and @total_dark fields do not account indexing
- * been 'taken' for use but not yet written to. 'taken_empty_lebs' is needed
+ * LEBs.
- * to account correctly for gc_lnum, otherwise 'empty_lebs' could be used
- * by itself (in which case 'unused_lebs' would be a better name). In the case
- * of gc_lnum, it is 'taken' at mount time or whenever a LEB is retained by GC,
- * but unlike other empty LEBs that are 'taken', it may not be written straight
- * away (i.e. before the next commit start or unmount), so either gc_lnum must
- * be specially accounted for, or the current approach followed i.e. count it
- * under 'taken_empty_lebs'.
 */
 struct ubifs_lp_stats {
        int empty_lebs;
@@ -893,15 +903,25 @@ struct ubifs_orphan {
 /**
 * struct ubifs_mount_opts - UBIFS-specific mount options information.
 * @unmount_mode: selected unmount mode (%0 default, %1 normal, %2 fast)
- * @bulk_read: enable bulk-reads
+ * @bulk_read: enable/disable bulk-reads (%0 default, %1 disable, %2 enable)
- * @chk_data_crc: check CRCs when reading data nodes
+ * @chk_data_crc: enable/disable CRC data checking when reading data nodes
+ *                (%0 default, %1 disable, %2 enable)
+ * @override_compr: override default compressor (%0 - do not override and use
+ *                  superblock compressor, %1 - override and use compressor
+ *                  specified in @compr_type)
+ * @compr_type: compressor type to override the superblock compressor with
+ *              (%UBIFS_COMPR_NONE, etc)
 */
 struct ubifs_mount_opts {
        unsigned int unmount_mode:2;
        unsigned int bulk_read:2;
        unsigned int chk_data_crc:2;
+        unsigned int override_compr:1;
+        unsigned int compr_type:2;
 };
+struct ubifs_debug_info;
 /**
 * struct ubifs_info - UBIFS file-system description data structure
 * (per-superblock).
@@ -941,11 +961,11 @@ struct ubifs_mount_opts {
 * @cs_lock: commit state lock
 * @cmt_wq: wait queue to sleep on if the log is full and a commit is running
 *
- * @fast_unmount: do not run journal commit before un-mounting
 * @big_lpt: flag that LPT is too big to write whole during commit
 * @no_chk_data_crc: do not check CRCs when reading data nodes (except during
 *                   recovery)
 * @bulk_read: enable bulk-reads
+ * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc)
 *
 * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and
 *             @calc_idx_sz
@@ -963,8 +983,6 @@ struct ubifs_mount_opts {
 * @ileb_nxt: next pre-allocated index LEBs
 * @old_idx: tree of index nodes obsoleted since the last commit start
 * @bottom_up_buf: a buffer which is used by 'dirty_cow_bottom_up()' in tnc.c
- * @new_ihead_lnum: used by debugging to check ihead_lnum
- * @new_ihead_offs: used by debugging to check ihead_offs
 *
 * @mst_node: master node
 * @mst_offs: offset of valid master node
@@ -986,7 +1004,6 @@ struct ubifs_mount_opts {
 * @main_lebs: count of LEBs in the main area
 * @main_first: first LEB of the main area
 * @main_bytes: main area size in bytes
- * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc)
 *
 * @key_hash_type: type of the key hash
 * @key_hash: direntry key hash function
@@ -1149,15 +1166,7 @@ struct ubifs_mount_opts {
 * @always_chk_crc: always check CRCs (while mounting and remounting rw)
 * @mount_opts: UBIFS-specific mount options
 *
- * @dbg_buf: a buffer of LEB size used for debugging purposes
+ * @dbg: debugging-related information
- * @old_zroot: old index root - used by 'dbg_check_old_index()'
- * @old_zroot_level: old index root level - used by 'dbg_check_old_index()'
- * @old_zroot_sqnum: old index root sqnum - used by 'dbg_check_old_index()'
- * @failure_mode: failure mode for recovery testing
- * @fail_delay: 0=>don't delay, 1=>delay a time, 2=>delay a number of calls
- * @fail_timeout: time in jiffies when delay of failure mode expires
- * @fail_cnt: current number of calls to failure mode I/O functions
- * @fail_cnt_max: number of calls by which to delay failure mode
 */
 struct ubifs_info {
        struct super_block *vfs_sb;
@@ -1192,10 +1201,10 @@ struct ubifs_info {
        spinlock_t cs_lock;
        wait_queue_head_t cmt_wq;
-        unsigned int fast_unmount:1;
        unsigned int big_lpt:1;
        unsigned int no_chk_data_crc:1;
        unsigned int bulk_read:1;
+        unsigned int default_compr:2;
        struct mutex tnc_mutex;
        struct ubifs_zbranch zroot;
@@ -1212,10 +1221,6 @@ struct ubifs_info {
        int ileb_nxt;
        struct rb_root old_idx;
        int *bottom_up_buf;
-#ifdef CONFIG_UBIFS_FS_DEBUG
-        int new_ihead_lnum;
-        int new_ihead_offs;
-#endif
        struct ubifs_mst_node *mst_node;
        int mst_offs;
@@ -1237,7 +1242,6 @@ struct ubifs_info {
        int main_lebs;
        int main_first;
        long long main_bytes;
-        int default_compr;
        uint8_t key_hash_type;
        uint32_t (*key_hash)(const char *str, int len);
@@ -1315,8 +1319,8 @@ struct ubifs_info {
        void *sbuf;
        struct list_head idx_gc;
        int idx_gc_cnt;
-        volatile int gc_seq;
+        int gc_seq;
-        volatile int gced_lnum;
+        int gced_lnum;
        struct list_head infos_list;
        struct mutex umount_mutex;
@@ -1391,21 +1395,7 @@ struct ubifs_info {
        struct ubifs_mount_opts mount_opts;
 #ifdef CONFIG_UBIFS_FS_DEBUG
-        void *dbg_buf;
+        struct ubifs_debug_info *dbg;
-        struct ubifs_zbranch old_zroot;
-        int old_zroot_level;
-        unsigned long long old_zroot_sqnum;
-        int failure_mode;
-        int fail_delay;
-        unsigned long fail_timeout;
-        unsigned int fail_cnt;
-        unsigned int fail_cnt_max;
-        long long chk_lpt_sz;
-        long long chk_lpt_sz2;
-        long long chk_lpt_wastage;
-        int chk_lpt_lebs;
-        int new_nhead_lnum;
-        int new_nhead_offs;
 #endif
 };
@@ -1413,13 +1403,13 @@ extern struct list_head ubifs_infos;
 extern spinlock_t ubifs_infos_lock;
 extern atomic_long_t ubifs_clean_zn_cnt;
 extern struct kmem_cache *ubifs_inode_slab;
-extern struct super_operations ubifs_super_operations;
+extern const struct super_operations ubifs_super_operations;
-extern struct address_space_operations ubifs_file_address_operations;
+extern const struct address_space_operations ubifs_file_address_operations;
-extern struct file_operations ubifs_file_operations;
+extern const struct file_operations ubifs_file_operations;
-extern struct inode_operations ubifs_file_inode_operations;
+extern const struct inode_operations ubifs_file_inode_operations;
-extern struct file_operations ubifs_dir_operations;
+extern const struct file_operations ubifs_dir_operations;
-extern struct inode_operations ubifs_dir_inode_operations;
+extern const struct inode_operations ubifs_dir_inode_operations;
-extern struct inode_operations ubifs_symlink_inode_operations;
+extern const struct inode_operations ubifs_symlink_inode_operations;
 extern struct backing_dev_info ubifs_backing_dev_info;
 extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT];
@@ -1436,7 +1426,7 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len,
 int ubifs_write_node(struct ubifs_info *c, void *node, int len, int lnum,
                     int offs, int dtype);
 int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
-                     int offs, int quiet, int chk_crc);
+                     int offs, int quiet, int must_chk_crc);
 void ubifs_prepare_node(struct ubifs_info *c, void *buf, int len, int pad);
 void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last);
 int ubifs_io_init(struct ubifs_info *c);
@@ -1503,9 +1493,10 @@ void ubifs_release_ino_dirty(struct ubifs_info *c, struct inode *inode,
 void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode,
                         struct ubifs_budget_req *req);
 long long ubifs_get_free_space(struct ubifs_info *c);
+long long ubifs_get_free_space_nolock(struct ubifs_info *c);
 int ubifs_calc_min_idx_lebs(struct ubifs_info *c);
 void ubifs_convert_page_budget(struct ubifs_info *c);
-long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free);
+long long ubifs_reported_space(const struct ubifs_info *c, long long free);
 long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs);
 /* find.c */
@@ -1611,6 +1602,7 @@ void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum);
 int ubifs_orphan_start_commit(struct ubifs_info *c);
 int ubifs_orphan_end_commit(struct ubifs_info *c);
 int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only);
+int ubifs_clear_orphans(struct ubifs_info *c);
 /* lpt.c */
 int ubifs_calc_lpt_geom(struct ubifs_info *c);
@@ -1639,6 +1631,9 @@ void ubifs_add_lpt_dirt(struct ubifs_info *c, int lnum, int dirty);
 void ubifs_add_nnode_dirt(struct ubifs_info *c, struct ubifs_nnode *nnode);
 uint32_t ubifs_unpack_bits(uint8_t **addr, int *pos, int nrbits);
 struct ubifs_nnode *ubifs_first_nnode(struct ubifs_info *c, int *hght);
+/* Needed only in debugging code in lpt_commit.c */
+int ubifs_unpack_nnode(const struct ubifs_info *c, void *buf,
+                       struct ubifs_nnode *nnode);
 /* lpt_commit.c */
 int ubifs_lpt_start_commit(struct ubifs_info *c);
@@ -1651,7 +1646,7 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
                                           const struct ubifs_lprops *lp,
                                           int free, int dirty, int flags,
                                           int idx_gc_cnt);
-void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *stats);
+void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *lst);
 void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops,
                      int cat);
 void ubifs_replace_cat(struct ubifs_info *c, struct ubifs_lprops *old_lprops,
@@ -1714,7 +1709,7 @@ long ubifs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
 /* compressor.c */
 int __init ubifs_compressors_init(void);
-void __exit ubifs_compressors_exit(void);
+void ubifs_compressors_exit(void);
 void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len,
                    int *compr_type);
 int ubifs_decompress(const void *buf, int len, void *out, int *out_len,
diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig
new file mode 100644
index 000000000000..0e0e99bd6bce
--- /dev/null
+++ b/fs/udf/Kconfig
@@ -0,0 +1,18 @@
+config UDF_FS
+        tristate "UDF file system support"
+        select CRC_ITU_T
+        help
+          This is the new file system used on some CD-ROMs and DVDs. Say Y if
+          you intend to mount DVD discs or CDRW's written in packet mode, or
+          if written to by other UDF utilities, such as DirectCD.
+          Please read <file:Documentation/filesystems/udf.txt>.
+          To compile this file system support as a module, choose M here: the
+          module will be called udf.
+          If unsure, say N.
+config UDF_NLS
+        bool
+        default y
+        depends on (UDF_FS=m && NLS) || (UDF_FS=y && NLS=y)
diff --git a/fs/ufs/Kconfig b/fs/ufs/Kconfig
new file mode 100644
index 000000000000..e4f10a40768a
--- /dev/null
+++ b/fs/ufs/Kconfig
@@ -0,0 +1,43 @@
+config UFS_FS
+        tristate "UFS file system support (read only)"
+        depends on BLOCK
+        help
+          BSD and derivative versions of Unix (such as SunOS, FreeBSD, NetBSD,
+          OpenBSD and NeXTstep) use a file system called UFS. Some System V
+          Unixes can create and mount hard disk partitions and diskettes using
+          this file system as well. Saying Y here will allow you to read from
+          these partitions; if you also want to write to them, say Y to the
+          experimental "UFS file system write support", below. Please read the
+          file <file:Documentation/filesystems/ufs.txt> for more information.
+          The recently released UFS2 variant (used in FreeBSD 5.x) is
+          READ-ONLY supported.
+          Note that this option is generally not needed for floppies, since a
+          good portable way to transport files and directories between unixes
+          (and even other operating systems) is given by the tar program ("man
+          tar" or preferably "info tar").
+          When accessing NeXTstep files, you may need to convert them from the
+          NeXT character set to the Latin1 character set; use the program
+          recode ("info recode") for this purpose.
+          To compile the UFS file system support as a module, choose M here: the
+          module will be called ufs.
+          If you haven't heard about all of this before, it's safe to say N.
+config UFS_FS_WRITE
+        bool "UFS file system write support (DANGEROUS)"
+        depends on UFS_FS && EXPERIMENTAL
+        help
+          Say Y here if you want to try writing to UFS partitions. This is
+          experimental, so you should back up your UFS partitions beforehand.
+config UFS_DEBUG
+        bool "UFS debugging"
+        depends on UFS_FS
+        help
+          If you are experiencing any problems with the UFS filesystem, say
+          Y here.  This will result in _many_ additional debugging messages to be
+          written to the system log.
diff --git a/fs/utimes.c b/fs/utimes.c
index 6929e3e91d05..e4c75db5d373 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -24,7 +24,7 @@
 * must be owner or have write permission.
 * Else, update from *times, must be owner or super user.
 */
-asmlinkage long sys_utime(char __user *filename, struct utimbuf __user *times)
+SYSCALL_DEFINE2(utime, char __user *, filename, struct utimbuf __user *, times)
 {
        struct timespec tv[2];
@@ -170,7 +170,8 @@ out:
        return error;
 }
-asmlinkage long sys_utimensat(int dfd, char __user *filename, struct timespec __user *utimes, int flags)
+SYSCALL_DEFINE4(utimensat, int, dfd, char __user *, filename,
+                struct timespec __user *, utimes, int, flags)
 {
        struct timespec tstimes[2];
@@ -187,7 +188,8 @@ asmlinkage long sys_utimensat(int dfd, char __user *filename, struct timespec __
        return do_utimes(dfd, filename, utimes ? tstimes : NULL, flags);
 }
-asmlinkage long sys_futimesat(int dfd, char __user *filename, struct timeval __user *utimes)
+SYSCALL_DEFINE3(futimesat, int, dfd, char __user *, filename,
+                struct timeval __user *, utimes)
 {
        struct timeval times[2];
        struct timespec tstimes[2];
@@ -214,7 +216,8 @@ asmlinkage long sys_futimesat(int dfd, char __user *filename, struct timeval __u
        return do_utimes(dfd, filename, utimes ? tstimes : NULL, 0);
 }
-asmlinkage long sys_utimes(char __user *filename, struct timeval __user *utimes)
+SYSCALL_DEFINE2(utimes, char __user *, filename,
+                struct timeval __user *, utimes)
 {
        return sys_futimesat(AT_FDCWD, filename, utimes);
 }
diff --git a/fs/xattr.c b/fs/xattr.c
index 468377e66531..197c4fcac032 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -175,7 +175,7 @@ vfs_listxattr(struct dentry *d, char *list, size_t size)
        if (error)
                return error;
        error = -EOPNOTSUPP;
-        if (d->d_inode->i_op && d->d_inode->i_op->listxattr) {
+        if (d->d_inode->i_op->listxattr) {
                error = d->d_inode->i_op->listxattr(d, list, size);
        } else {
                error = security_inode_listsecurity(d->d_inode, list, size);
@@ -251,9 +251,9 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value,
        return error;
 }
-asmlinkage long
+SYSCALL_DEFINE5(setxattr, const char __user *, pathname,
-sys_setxattr(const char __user *pathname, const char __user *name,
+                const char __user *, name, const void __user *, value,
-             const void __user *value, size_t size, int flags)
+                size_t, size, int, flags)
 {
        struct path path;
        int error;
@@ -270,9 +270,9 @@ sys_setxattr(const char __user *pathname, const char __user *name,
        return error;
 }
-asmlinkage long
+SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname,
-sys_lsetxattr(const char __user *pathname, const char __user *name,
+                const char __user *, name, const void __user *, value,
-              const void __user *value, size_t size, int flags)
+                size_t, size, int, flags)
 {
        struct path path;
        int error;
@@ -289,9 +289,8 @@ sys_lsetxattr(const char __user *pathname, const char __user *name,
        return error;
 }
-asmlinkage long
+SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
-sys_fsetxattr(int fd, const char __user *name, const void __user *value,
+                const void __user *,value, size_t, size, int, flags)
-              size_t size, int flags)
 {
        struct file *f;
        struct dentry *dentry;
@@ -349,9 +348,8 @@ getxattr(struct dentry *d, const char __user *name, void __user *value,
        return error;
 }
-asmlinkage ssize_t
+SYSCALL_DEFINE4(getxattr, const char __user *, pathname,
-sys_getxattr(const char __user *pathname, const char __user *name,
+                const char __user *, name, void __user *, value, size_t, size)
-             void __user *value, size_t size)
 {
        struct path path;
        ssize_t error;
@@ -364,9 +362,8 @@ sys_getxattr(const char __user *pathname, const char __user *name,
        return error;
 }
-asmlinkage ssize_t
+SYSCALL_DEFINE4(lgetxattr, const char __user *, pathname,
-sys_lgetxattr(const char __user *pathname, const char __user *name, void __user *value,
+                const char __user *, name, void __user *, value, size_t, size)
-              size_t size)
 {
        struct path path;
        ssize_t error;
@@ -379,8 +376,8 @@ sys_lgetxattr(const char __user *pathname, const char __user *name, void __user
        return error;
 }
-asmlinkage ssize_t
+SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name,
-sys_fgetxattr(int fd, const char __user *name, void __user *value, size_t size)
+                void __user *, value, size_t, size)
 {
        struct file *f;
        ssize_t error = -EBADF;
@@ -424,8 +421,8 @@ listxattr(struct dentry *d, char __user *list, size_t size)
        return error;
 }
-asmlinkage ssize_t
+SYSCALL_DEFINE3(listxattr, const char __user *, pathname, char __user *, list,
-sys_listxattr(const char __user *pathname, char __user *list, size_t size)
+                size_t, size)
 {
        struct path path;
        ssize_t error;
@@ -438,8 +435,8 @@ sys_listxattr(const char __user *pathname, char __user *list, size_t size)
        return error;
 }
-asmlinkage ssize_t
+SYSCALL_DEFINE3(llistxattr, const char __user *, pathname, char __user *, list,
-sys_llistxattr(const char __user *pathname, char __user *list, size_t size)
+                size_t, size)
 {
        struct path path;
        ssize_t error;
@@ -452,8 +449,7 @@ sys_llistxattr(const char __user *pathname, char __user *list, size_t size)
        return error;
 }
-asmlinkage ssize_t
+SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size)
-sys_flistxattr(int fd, char __user *list, size_t size)
 {
        struct file *f;
        ssize_t error = -EBADF;
@@ -485,8 +481,8 @@ removexattr(struct dentry *d, const char __user *name)
        return vfs_removexattr(d, kname);
 }
-asmlinkage long
+SYSCALL_DEFINE2(removexattr, const char __user *, pathname,
-sys_removexattr(const char __user *pathname, const char __user *name)
+                const char __user *, name)
 {
        struct path path;
        int error;
@@ -503,8 +499,8 @@ sys_removexattr(const char __user *pathname, const char __user *name)
        return error;
 }
-asmlinkage long
+SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname,
-sys_lremovexattr(const char __user *pathname, const char __user *name)
+                const char __user *, name)
 {
        struct path path;
        int error;
@@ -521,8 +517,7 @@ sys_lremovexattr(const char __user *pathname, const char __user *name)
        return error;
 }
-asmlinkage long
+SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)
-sys_fremovexattr(int fd, const char __user *name)
 {
        struct file *f;
        struct dentry *dentry;
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 3f53dd101f99..29228f5899cd 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -1,6 +1,7 @@
 config XFS_FS
        tristate "XFS filesystem support"
        depends on BLOCK
+        select EXPORTFS
        help
          XFS is a high performance journaling filesystem which originated
          on the SGI IRIX platform.  It is completely multi-threaded, can
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
index 7b26f5ff9692..1dd528849755 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -21,8 +21,6 @@
 extern struct workqueue_struct *xfsdatad_workqueue;
 extern mempool_t *xfs_ioend_pool;
-typedef void (*xfs_ioend_func_t)(void *);
 /*
 * xfs_ioend struct manages large extent writes for XFS.
 * It can manage several multi-page bio's at once.
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index cb329edc925b..d71dc44e21ed 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -166,75 +166,6 @@ test_page_region(
 }
 /*
- *      Mapping of multi-page buffers into contiguous virtual space
- */
-typedef struct a_list {
-        void            *vm_addr;
-        struct a_list   *next;
-} a_list_t;
-static a_list_t         *as_free_head;
-static int              as_list_len;
-static DEFINE_SPINLOCK(as_lock);
-/*
- *      Try to batch vunmaps because they are costly.
- */
-STATIC void
-free_address(
-        void            *addr)
-{
-        a_list_t        *aentry;
-#ifdef CONFIG_XEN
-        /*
-         * Xen needs to be able to make sure it can get an exclusive
-         * RO mapping of pages it wants to turn into a pagetable.  If
-         * a newly allocated page is also still being vmap()ed by xfs,
-         * it will cause pagetable construction to fail.  This is a
-         * quick workaround to always eagerly unmap pages so that Xen
-         * is happy.
-         */
-        vunmap(addr);
-        return;
-#endif
-        aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT);
-        if (likely(aentry)) {
-                spin_lock(&as_lock);
-                aentry->next = as_free_head;
-                aentry->vm_addr = addr;
-                as_free_head = aentry;
-                as_list_len++;
-                spin_unlock(&as_lock);
-        } else {
-                vunmap(addr);
-        }
-}
-STATIC void
-purge_addresses(void)
-{
-        a_list_t        *aentry, *old;
-        if (as_free_head == NULL)
-                return;
-        spin_lock(&as_lock);
-        aentry = as_free_head;
-        as_free_head = NULL;
-        as_list_len = 0;
-        spin_unlock(&as_lock);
-        while ((old = aentry) != NULL) {
-                vunmap(aentry->vm_addr);
-                aentry = aentry->next;
-                kfree(old);
-        }
-}
-/*
 *      Internal xfs_buf_t object manipulation
 */
@@ -333,7 +264,7 @@ xfs_buf_free(
                uint            i;
                if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1))
-                        free_address(bp->b_addr - bp->b_offset);
+                       vm_unmap_ram(bp->b_addr - bp->b_offset, bp->b_page_count);
                for (i = 0; i < bp->b_page_count; i++) {
                        struct page     *page = bp->b_pages[i];
@@ -455,10 +386,8 @@ _xfs_buf_map_pages(
                bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
                bp->b_flags |= XBF_MAPPED;
        } else if (flags & XBF_MAPPED) {
-                if (as_list_len > 64)
+               bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
-                        purge_addresses();
+                                       -1, PAGE_KERNEL);
-                bp->b_addr = vmap(bp->b_pages, bp->b_page_count,
-                                        VM_MAP, PAGE_KERNEL);
                if (unlikely(bp->b_addr == NULL))
                        return -ENOMEM;
                bp->b_addr += bp->b_offset;
@@ -1743,8 +1672,6 @@ xfsbufd(
                        count++;
                }
-                if (as_list_len > 0)
-                        purge_addresses();
                if (count)
                        blk_run_address_space(target->bt_mapping);
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index 595751f78350..87b8cbd23d4b 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -126,11 +126,26 @@ xfs_nfs_get_inode(
        if (ino == 0)
                return ERR_PTR(-ESTALE);
-        error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, 0);
+        /*
-        if (error)
+         * The XFS_IGET_BULKSTAT flag means that an invalid inode number is
+         * just fine and not an indication of a corrupted filesystem.  Because
+         * clients can send any kind of invalid file handle, e.g. after
+         * a restore on the server, we have to deal with this case gracefully.
+         */
+        error = xfs_iget(mp, NULL, ino, XFS_IGET_BULKSTAT,
+                         XFS_ILOCK_SHARED, &ip, 0);
+        if (error) {
+                /*
+                 * EINVAL means the inode cluster doesn't exist anymore.
+                 * This implies the filehandle is stale, so we should
+                 * translate it here.
+                 * We don't use ESTALE directly down the chain to not
+                 * confuse applications using bulkstat that expect EINVAL.
+                 */
+                if (error == EINVAL)
+                        error = ESTALE;
                return ERR_PTR(-error);
-        if (!ip)
+        }
-                return ERR_PTR(-EIO);
        if (ip->i_d.di_gen != generation) {
                xfs_iput_new(ip, XFS_ILOCK_SHARED);
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 67205f6198ba..4bd112313f33 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -50,12 +50,14 @@
 #include "xfs_vnodeops.h"
 #include "xfs_quota.h"
 #include "xfs_inode_item.h"
+#include "xfs_export.h"
 #include <linux/capability.h>
 #include <linux/dcache.h>
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/pagemap.h>
+#include <linux/exportfs.h>
 /*
 * xfs_find_handle maps from userspace xfs_fsop_handlereq structure to
@@ -164,97 +166,69 @@ xfs_find_handle(
        return 0;
 }
 /*
- * Convert userspace handle data into inode.
+ * No need to do permission checks on the various pathname components
- *
+ * as the handle operations are privileged.
- * We use the fact that all the fsop_handlereq ioctl calls have a data
- * structure argument whose first component is always a xfs_fsop_handlereq_t,
- * so we can pass that sub structure into this handy, shared routine.
- *
- * If no error, caller must always iput the returned inode.
 */
 STATIC int
-xfs_vget_fsop_handlereq(
+xfs_handle_acceptable(
-        xfs_mount_t             *mp,
+        void                    *context,
-        struct inode            *parinode,      /* parent inode pointer    */
+        struct dentry           *dentry)
-        xfs_fsop_handlereq_t    *hreq,
+{
-        struct inode            **inode)
+        return 1;
+}
+/*
+ * Convert userspace handle data into a dentry.
+ */
+struct dentry *
+xfs_handle_to_dentry(
+        struct file             *parfilp,
+        void __user             *uhandle,
+        u32                     hlen)
 {
-        void                    __user *hanp;
-        size_t                  hlen;
-        xfs_fid_t               *xfid;
-        xfs_handle_t            *handlep;
        xfs_handle_t            handle;
-        xfs_inode_t             *ip;
+        struct xfs_fid64        fid;
-        xfs_ino_t               ino;
-        __u32                   igen;
-        int                     error;
        /*
         * Only allow handle opens under a directory.
         */
-        if (!S_ISDIR(parinode->i_mode))
+        if (!S_ISDIR(parfilp->f_path.dentry->d_inode->i_mode))
-                return XFS_ERROR(ENOTDIR);
+                return ERR_PTR(-ENOTDIR);
-        hanp = hreq->ihandle;
+        if (hlen != sizeof(xfs_handle_t))
-        hlen = hreq->ihandlen;
+                return ERR_PTR(-EINVAL);
-        handlep = &handle;
+        if (copy_from_user(&handle, uhandle, hlen))
+                return ERR_PTR(-EFAULT);
-        if (hlen < sizeof(handlep->ha_fsid) || hlen > sizeof(*handlep))
+        if (handle.ha_fid.fid_len !=
-                return XFS_ERROR(EINVAL);
+            sizeof(handle.ha_fid) - sizeof(handle.ha_fid.fid_len))
-        if (copy_from_user(handlep, hanp, hlen))
+                return ERR_PTR(-EINVAL);
-                return XFS_ERROR(EFAULT);
-        if (hlen < sizeof(*handlep))
+        memset(&fid, 0, sizeof(struct fid));
-                memset(((char *)handlep) + hlen, 0, sizeof(*handlep) - hlen);
+        fid.ino = handle.ha_fid.fid_ino;
-        if (hlen > sizeof(handlep->ha_fsid)) {
+        fid.gen = handle.ha_fid.fid_gen;
-                if (handlep->ha_fid.fid_len !=
-                    (hlen - sizeof(handlep->ha_fsid) -
+        return exportfs_decode_fh(parfilp->f_path.mnt, (struct fid *)&fid, 3,
-                            sizeof(handlep->ha_fid.fid_len)) ||
+                        FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG,
-                    handlep->ha_fid.fid_pad)
+                        xfs_handle_acceptable, NULL);
-                        return XFS_ERROR(EINVAL);
+}
-        }
-        /*
-         * Crack the handle, obtain the inode # & generation #
-         */
-        xfid = (struct xfs_fid *)&handlep->ha_fid;
-        if (xfid->fid_len == sizeof(*xfid) - sizeof(xfid->fid_len)) {
-                ino  = xfid->fid_ino;
-                igen = xfid->fid_gen;
-        } else {
-                return XFS_ERROR(EINVAL);
-        }
-        /*
-         * Get the XFS inode, building a Linux inode to go with it.
-         */
-        error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, 0);
-        if (error)
-                return error;
-        if (ip == NULL)
-                return XFS_ERROR(EIO);
-        if (ip->i_d.di_gen != igen) {
-                xfs_iput_new(ip, XFS_ILOCK_SHARED);
-                return XFS_ERROR(ENOENT);
-        }
-        xfs_iunlock(ip, XFS_ILOCK_SHARED);
-        *inode = VFS_I(ip);
+STATIC struct dentry *
-        return 0;
+xfs_handlereq_to_dentry(
+        struct file             *parfilp,
+        xfs_fsop_handlereq_t    *hreq)
+{
+        return xfs_handle_to_dentry(parfilp, hreq->ihandle, hreq->ihandlen);
 }
 int
 xfs_open_by_handle(
-        xfs_mount_t             *mp,
-        xfs_fsop_handlereq_t    *hreq,
        struct file             *parfilp,
-        struct inode            *parinode)
+        xfs_fsop_handlereq_t    *hreq)
 {
        const struct cred       *cred = current_cred();
        int                     error;
-        int                     new_fd;
+        int                     fd;
        int                     permflag;
        struct file             *filp;
        struct inode            *inode;
@@ -263,19 +237,21 @@ xfs_open_by_handle(
        if (!capable(CAP_SYS_ADMIN))
                return -XFS_ERROR(EPERM);
-        error = xfs_vget_fsop_handlereq(mp, parinode, hreq, &inode);
+        dentry = xfs_handlereq_to_dentry(parfilp, hreq);
-        if (error)
+        if (IS_ERR(dentry))
-                return -error;
+                return PTR_ERR(dentry);
+        inode = dentry->d_inode;
        /* Restrict xfs_open_by_handle to directories & regular files. */
        if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) {
-                iput(inode);
+                error = -XFS_ERROR(EPERM);
-                return -XFS_ERROR(EINVAL);
+                goto out_dput;
        }
 #if BITS_PER_LONG != 32
        hreq->oflags |= O_LARGEFILE;
 #endif
        /* Put open permission in namei format. */
        permflag = hreq->oflags;
        if ((permflag+1) & O_ACCMODE)
@@ -285,50 +261,45 @@ xfs_open_by_handle(
        if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) &&
            (permflag & FMODE_WRITE) && IS_APPEND(inode)) {
-                iput(inode);
+                error = -XFS_ERROR(EPERM);
-                return -XFS_ERROR(EPERM);
+                goto out_dput;
        }
        if ((permflag & FMODE_WRITE) && IS_IMMUTABLE(inode)) {
-                iput(inode);
+                error = -XFS_ERROR(EACCES);
-                return -XFS_ERROR(EACCES);
+                goto out_dput;
        }
        /* Can't write directories. */
-        if ( S_ISDIR(inode->i_mode) && (permflag & FMODE_WRITE)) {
+        if (S_ISDIR(inode->i_mode) && (permflag & FMODE_WRITE)) {
-                iput(inode);
+                error = -XFS_ERROR(EISDIR);
-                return -XFS_ERROR(EISDIR);
+                goto out_dput;
-        }
-        if ((new_fd = get_unused_fd()) < 0) {
-                iput(inode);
-                return new_fd;
        }
-        dentry = d_obtain_alias(inode);
+        fd = get_unused_fd();
-        if (IS_ERR(dentry)) {
+        if (fd < 0) {
-                put_unused_fd(new_fd);
+                error = fd;
-                return PTR_ERR(dentry);
+                goto out_dput;
        }
-        /* Ensure umount returns EBUSY on umounts while this file is open. */
+        filp = dentry_open(dentry, mntget(parfilp->f_path.mnt),
-        mntget(parfilp->f_path.mnt);
+                           hreq->oflags, cred);
-        /* Create file pointer. */
-        filp = dentry_open(dentry, parfilp->f_path.mnt, hreq->oflags, cred);
        if (IS_ERR(filp)) {
-                put_unused_fd(new_fd);
+                put_unused_fd(fd);
-                return -XFS_ERROR(-PTR_ERR(filp));
+                return PTR_ERR(filp);
        }
        if (inode->i_mode & S_IFREG) {
-                /* invisible operation should not change atime */
                filp->f_flags |= O_NOATIME;
                filp->f_mode |= FMODE_NOCMTIME;
        }
-        fd_install(new_fd, filp);
+        fd_install(fd, filp);
-        return new_fd;
+        return fd;
+ out_dput:
+        dput(dentry);
+        return error;
 }
 /*
@@ -359,11 +330,10 @@ do_readlink(
 int
 xfs_readlink_by_handle(
-        xfs_mount_t             *mp,
+        struct file             *parfilp,
-        xfs_fsop_handlereq_t    *hreq,
+        xfs_fsop_handlereq_t    *hreq)
-        struct inode            *parinode)
 {
-        struct inode            *inode;
+        struct dentry           *dentry;
        __u32                   olen;
        void                    *link;
        int                     error;
@@ -371,26 +341,28 @@ xfs_readlink_by_handle(
        if (!capable(CAP_SYS_ADMIN))
                return -XFS_ERROR(EPERM);
-        error = xfs_vget_fsop_handlereq(mp, parinode, hreq, &inode);
+        dentry = xfs_handlereq_to_dentry(parfilp, hreq);
-        if (error)
+        if (IS_ERR(dentry))
-                return -error;
+                return PTR_ERR(dentry);
        /* Restrict this handle operation to symlinks only. */
-        if (!S_ISLNK(inode->i_mode)) {
+        if (!S_ISLNK(dentry->d_inode->i_mode)) {
                error = -XFS_ERROR(EINVAL);
-                goto out_iput;
+                goto out_dput;
        }
        if (copy_from_user(&olen, hreq->ohandlen, sizeof(__u32))) {
                error = -XFS_ERROR(EFAULT);
-                goto out_iput;
+                goto out_dput;
        }
        link = kmalloc(MAXPATHLEN+1, GFP_KERNEL);
-        if (!link)
+        if (!link) {
-                goto out_iput;
+                error = -XFS_ERROR(ENOMEM);
+                goto out_dput;
+        }
-        error = -xfs_readlink(XFS_I(inode), link);
+        error = -xfs_readlink(XFS_I(dentry->d_inode), link);
        if (error)
                goto out_kfree;
        error = do_readlink(hreq->ohandle, olen, link);
@@ -399,32 +371,31 @@ xfs_readlink_by_handle(
 out_kfree:
        kfree(link);
- out_iput:
+ out_dput:
-        iput(inode);
+        dput(dentry);
        return error;
 }
 STATIC int
 xfs_fssetdm_by_handle(
-        xfs_mount_t             *mp,
+        struct file             *parfilp,
-        void                    __user *arg,
+        void                    __user *arg)
-        struct inode            *parinode)
 {
        int                     error;
        struct fsdmidata        fsd;
        xfs_fsop_setdm_handlereq_t dmhreq;
-        struct inode            *inode;
+        struct dentry           *dentry;
        if (!capable(CAP_MKNOD))
                return -XFS_ERROR(EPERM);
        if (copy_from_user(&dmhreq, arg, sizeof(xfs_fsop_setdm_handlereq_t)))
                return -XFS_ERROR(EFAULT);
-        error = xfs_vget_fsop_handlereq(mp, parinode, &dmhreq.hreq, &inode);
+        dentry = xfs_handlereq_to_dentry(parfilp, &dmhreq.hreq);
-        if (error)
+        if (IS_ERR(dentry))
-                return -error;
+                return PTR_ERR(dentry);
-        if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) {
+        if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) {
                error = -XFS_ERROR(EPERM);
                goto out;
        }
@@ -434,24 +405,23 @@ xfs_fssetdm_by_handle(
                goto out;
        }
-        error = -xfs_set_dmattrs(XFS_I(inode), fsd.fsd_dmevmask,
+        error = -xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask,
                                 fsd.fsd_dmstate);
 out:
-        iput(inode);
+        dput(dentry);
        return error;
 }
 STATIC int
 xfs_attrlist_by_handle(
-        xfs_mount_t             *mp,
+        struct file             *parfilp,
-        void                    __user *arg,
+        void                    __user *arg)
-        struct inode            *parinode)
 {
-        int                     error;
+        int                     error = -ENOMEM;
        attrlist_cursor_kern_t  *cursor;
        xfs_fsop_attrlist_handlereq_t al_hreq;
-        struct inode            *inode;
+        struct dentry           *dentry;
        char                    *kbuf;
        if (!capable(CAP_SYS_ADMIN))
@@ -467,16 +437,16 @@ xfs_attrlist_by_handle(
        if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE))
                return -XFS_ERROR(EINVAL);
-        error = xfs_vget_fsop_handlereq(mp, parinode, &al_hreq.hreq, &inode);
+        dentry = xfs_handlereq_to_dentry(parfilp, &al_hreq.hreq);
-        if (error)
+        if (IS_ERR(dentry))
-                goto out;
+                return PTR_ERR(dentry);
        kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL);
        if (!kbuf)
-                goto out_vn_rele;
+                goto out_dput;
        cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
-        error = xfs_attr_list(XFS_I(inode), kbuf, al_hreq.buflen,
+        error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen,
                                        al_hreq.flags, cursor);
        if (error)
                goto out_kfree;
@@ -486,10 +456,9 @@ xfs_attrlist_by_handle(
 out_kfree:
        kfree(kbuf);
- out_vn_rele:
+ out_dput:
-        iput(inode);
+        dput(dentry);
- out:
+        return error;
-        return -error;
 }
 int
@@ -564,15 +533,13 @@ xfs_attrmulti_attr_remove(
 STATIC int
 xfs_attrmulti_by_handle(
-        xfs_mount_t             *mp,
-        void                    __user *arg,
        struct file             *parfilp,
-        struct inode            *parinode)
+        void                    __user *arg)
 {
        int                     error;
        xfs_attr_multiop_t      *ops;
        xfs_fsop_attrmulti_handlereq_t am_hreq;
-        struct inode            *inode;
+        struct dentry           *dentry;
        unsigned int            i, size;
        char                    *attr_name;
@@ -581,19 +548,19 @@ xfs_attrmulti_by_handle(
        if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t)))
                return -XFS_ERROR(EFAULT);
-        error = xfs_vget_fsop_handlereq(mp, parinode, &am_hreq.hreq, &inode);
+        dentry = xfs_handlereq_to_dentry(parfilp, &am_hreq.hreq);
-        if (error)
+        if (IS_ERR(dentry))
-                goto out;
+                return PTR_ERR(dentry);
        error = E2BIG;
        size = am_hreq.opcount * sizeof(xfs_attr_multiop_t);
        if (!size || size > 16 * PAGE_SIZE)
-                goto out_vn_rele;
+                goto out_dput;
        error = ENOMEM;
        ops = kmalloc(size, GFP_KERNEL);
        if (!ops)
-                goto out_vn_rele;
+                goto out_dput;
        error = EFAULT;
        if (copy_from_user(ops, am_hreq.ops, size))
@@ -615,25 +582,28 @@ xfs_attrmulti_by_handle(
                switch (ops[i].am_opcode) {
                case ATTR_OP_GET:
-                        ops[i].am_error = xfs_attrmulti_attr_get(inode,
+                        ops[i].am_error = xfs_attrmulti_attr_get(
-                                        attr_name, ops[i].am_attrvalue,
+                                        dentry->d_inode, attr_name,
-                                        &ops[i].am_length, ops[i].am_flags);
+                                        ops[i].am_attrvalue, &ops[i].am_length,
+                                        ops[i].am_flags);
                        break;
                case ATTR_OP_SET:
                        ops[i].am_error = mnt_want_write(parfilp->f_path.mnt);
                        if (ops[i].am_error)
                                break;
-                        ops[i].am_error = xfs_attrmulti_attr_set(inode,
+                        ops[i].am_error = xfs_attrmulti_attr_set(
-                                        attr_name, ops[i].am_attrvalue,
+                                        dentry->d_inode, attr_name,
-                                        ops[i].am_length, ops[i].am_flags);
+                                        ops[i].am_attrvalue, ops[i].am_length,
+                                        ops[i].am_flags);
                        mnt_drop_write(parfilp->f_path.mnt);
                        break;
                case ATTR_OP_REMOVE:
                        ops[i].am_error = mnt_want_write(parfilp->f_path.mnt);
                        if (ops[i].am_error)
                                break;
-                        ops[i].am_error = xfs_attrmulti_attr_remove(inode,
+                        ops[i].am_error = xfs_attrmulti_attr_remove(
-                                        attr_name, ops[i].am_flags);
+                                        dentry->d_inode, attr_name,
+                                        ops[i].am_flags);
                        mnt_drop_write(parfilp->f_path.mnt);
                        break;
                default:
@@ -647,9 +617,8 @@ xfs_attrmulti_by_handle(
        kfree(attr_name);
 out_kfree_ops:
        kfree(ops);
- out_vn_rele:
+ out_dput:
-        iput(inode);
+        dput(dentry);
- out:
        return -error;
 }
@@ -1440,23 +1409,23 @@ xfs_file_ioctl(
                if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
                        return -XFS_ERROR(EFAULT);
-                return xfs_open_by_handle(mp, &hreq, filp, inode);
+                return xfs_open_by_handle(filp, &hreq);
        }
        case XFS_IOC_FSSETDM_BY_HANDLE:
-                return xfs_fssetdm_by_handle(mp, arg, inode);
+                return xfs_fssetdm_by_handle(filp, arg);
        case XFS_IOC_READLINK_BY_HANDLE: {
                xfs_fsop_handlereq_t    hreq;
                if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
                        return -XFS_ERROR(EFAULT);
-                return xfs_readlink_by_handle(mp, &hreq, inode);
+                return xfs_readlink_by_handle(filp, &hreq);
        }
        case XFS_IOC_ATTRLIST_BY_HANDLE:
-                return xfs_attrlist_by_handle(mp, arg, inode);
+                return xfs_attrlist_by_handle(filp, arg);
        case XFS_IOC_ATTRMULTI_BY_HANDLE:
-                return xfs_attrmulti_by_handle(mp, arg, filp, inode);
+                return xfs_attrmulti_by_handle(filp, arg);
        case XFS_IOC_SWAPEXT: {
                struct xfs_swapext      sxp;
@@ -1546,21 +1515,6 @@ xfs_file_ioctl(
                return -error;
        }
-        case XFS_IOC_FREEZE:
-                if (!capable(CAP_SYS_ADMIN))
-                        return -EPERM;
-                if (inode->i_sb->s_frozen == SB_UNFROZEN)
-                        freeze_bdev(inode->i_sb->s_bdev);
-                return 0;
-        case XFS_IOC_THAW:
-                if (!capable(CAP_SYS_ADMIN))
-                        return -EPERM;
-                if (inode->i_sb->s_frozen != SB_UNFROZEN)
-                        thaw_bdev(inode->i_sb->s_bdev, inode->i_sb);
-                return 0;
        case XFS_IOC_GOINGDOWN: {
                __uint32_t in;
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.h b/fs/xfs/linux-2.6/xfs_ioctl.h
index 8c16bf2d7e03..7bd7c6afc1eb 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.h
+++ b/fs/xfs/linux-2.6/xfs_ioctl.h
@@ -34,16 +34,13 @@ xfs_find_handle(
 extern int
 xfs_open_by_handle(
-        xfs_mount_t             *mp,
-        xfs_fsop_handlereq_t    *hreq,
        struct file             *parfilp,
-        struct inode            *parinode);
+        xfs_fsop_handlereq_t    *hreq);
 extern int
 xfs_readlink_by_handle(
-        xfs_mount_t             *mp,
+        struct file             *parfilp,
-        xfs_fsop_handlereq_t    *hreq,
+        xfs_fsop_handlereq_t    *hreq);
-        struct inode            *parinode);
 extern int
 xfs_attrmulti_attr_get(
@@ -67,6 +64,12 @@ xfs_attrmulti_attr_remove(
        char                    *name,
        __uint32_t              flags);
+extern struct dentry *
+xfs_handle_to_dentry(
+        struct file             *parfilp,
+        void __user             *uhandle,
+        u32                     hlen);
 extern long
 xfs_file_ioctl(
        struct file             *filp,
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index 0504cece9f66..c70c4e3db790 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -17,6 +17,7 @@
 */
 #include <linux/compat.h>
 #include <linux/ioctl.h>
+#include <linux/mount.h>
 #include <asm/uaccess.h>
 #include "xfs.h"
 #include "xfs_fs.h"
@@ -340,96 +341,24 @@ xfs_compat_handlereq_copyin(
        return 0;
 }
-/*
+STATIC struct dentry *
- * Convert userspace handle data into inode.
+xfs_compat_handlereq_to_dentry(
- *
+        struct file             *parfilp,
- * We use the fact that all the fsop_handlereq ioctl calls have a data
+        compat_xfs_fsop_handlereq_t *hreq)
- * structure argument whose first component is always a xfs_fsop_handlereq_t,
- * so we can pass that sub structure into this handy, shared routine.
- *
- * If no error, caller must always iput the returned inode.
- */
-STATIC int
-xfs_vget_fsop_handlereq_compat(
-        xfs_mount_t             *mp,
-        struct inode            *parinode,      /* parent inode pointer    */
-        compat_xfs_fsop_handlereq_t     *hreq,
-        struct inode            **inode)
 {
-        void                    __user *hanp;
+        return xfs_handle_to_dentry(parfilp,
-        size_t                  hlen;
+                        compat_ptr(hreq->ihandle), hreq->ihandlen);
-        xfs_fid_t               *xfid;
-        xfs_handle_t            *handlep;
-        xfs_handle_t            handle;
-        xfs_inode_t             *ip;
-        xfs_ino_t               ino;
-        __u32                   igen;
-        int                     error;
-        /*
-         * Only allow handle opens under a directory.
-         */
-        if (!S_ISDIR(parinode->i_mode))
-                return XFS_ERROR(ENOTDIR);
-        hanp = compat_ptr(hreq->ihandle);
-        hlen = hreq->ihandlen;
-        handlep = &handle;
-        if (hlen < sizeof(handlep->ha_fsid) || hlen > sizeof(*handlep))
-                return XFS_ERROR(EINVAL);
-        if (copy_from_user(handlep, hanp, hlen))
-                return XFS_ERROR(EFAULT);
-        if (hlen < sizeof(*handlep))
-                memset(((char *)handlep) + hlen, 0, sizeof(*handlep) - hlen);
-        if (hlen > sizeof(handlep->ha_fsid)) {
-                if (handlep->ha_fid.fid_len !=
-                    (hlen - sizeof(handlep->ha_fsid) -
-                            sizeof(handlep->ha_fid.fid_len)) ||
-                    handlep->ha_fid.fid_pad)
-                        return XFS_ERROR(EINVAL);
-        }
-        /*
-         * Crack the handle, obtain the inode # & generation #
-         */
-        xfid = (struct xfs_fid *)&handlep->ha_fid;
-        if (xfid->fid_len == sizeof(*xfid) - sizeof(xfid->fid_len)) {
-                ino  = xfid->fid_ino;
-                igen = xfid->fid_gen;
-        } else {
-                return XFS_ERROR(EINVAL);
-        }
-        /*
-         * Get the XFS inode, building a Linux inode to go with it.
-         */
-        error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, 0);
-        if (error)
-                return error;
-        if (ip == NULL)
-                return XFS_ERROR(EIO);
-        if (ip->i_d.di_gen != igen) {
-                xfs_iput_new(ip, XFS_ILOCK_SHARED);
-                return XFS_ERROR(ENOENT);
-        }
-        xfs_iunlock(ip, XFS_ILOCK_SHARED);
-        *inode = VFS_I(ip);
-        return 0;
 }
 STATIC int
 xfs_compat_attrlist_by_handle(
-        xfs_mount_t             *mp,
+        struct file             *parfilp,
-        void                    __user *arg,
+        void                    __user *arg)
-        struct inode            *parinode)
 {
        int                     error;
        attrlist_cursor_kern_t  *cursor;
        compat_xfs_fsop_attrlist_handlereq_t al_hreq;
-        struct inode            *inode;
+        struct dentry           *dentry;
        char                    *kbuf;
        if (!capable(CAP_SYS_ADMIN))
@@ -446,17 +375,17 @@ xfs_compat_attrlist_by_handle(
        if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE))
                return -XFS_ERROR(EINVAL);
-        error = xfs_vget_fsop_handlereq_compat(mp, parinode, &al_hreq.hreq,
+        dentry = xfs_compat_handlereq_to_dentry(parfilp, &al_hreq.hreq);
-                                               &inode);
+        if (IS_ERR(dentry))
-        if (error)
+                return PTR_ERR(dentry);
-                goto out;
+        error = -ENOMEM;
        kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL);
        if (!kbuf)
-                goto out_vn_rele;
+                goto out_dput;
        cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
-        error = xfs_attr_list(XFS_I(inode), kbuf, al_hreq.buflen,
+        error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen,
                                        al_hreq.flags, cursor);
        if (error)
                goto out_kfree;
@@ -466,22 +395,20 @@ xfs_compat_attrlist_by_handle(
 out_kfree:
        kfree(kbuf);
- out_vn_rele:
+ out_dput:
-        iput(inode);
+        dput(dentry);
- out:
+        return error;
-        return -error;
 }
 STATIC int
 xfs_compat_attrmulti_by_handle(
-        xfs_mount_t                             *mp,
+        struct file                             *parfilp,
-        void                                    __user *arg,
+        void                                    __user *arg)
-        struct inode                            *parinode)
 {
        int                                     error;
        compat_xfs_attr_multiop_t               *ops;
        compat_xfs_fsop_attrmulti_handlereq_t   am_hreq;
-        struct inode                            *inode;
+        struct dentry                           *dentry;
        unsigned int                            i, size;
        char                                    *attr_name;
@@ -491,20 +418,19 @@ xfs_compat_attrmulti_by_handle(
                           sizeof(compat_xfs_fsop_attrmulti_handlereq_t)))
                return -XFS_ERROR(EFAULT);
-        error = xfs_vget_fsop_handlereq_compat(mp, parinode, &am_hreq.hreq,
+        dentry = xfs_compat_handlereq_to_dentry(parfilp, &am_hreq.hreq);
-                                               &inode);
+        if (IS_ERR(dentry))
-        if (error)
+                return PTR_ERR(dentry);
-                goto out;
        error = E2BIG;
        size = am_hreq.opcount * sizeof(compat_xfs_attr_multiop_t);
        if (!size || size > 16 * PAGE_SIZE)
-                goto out_vn_rele;
+                goto out_dput;
        error = ENOMEM;
        ops = kmalloc(size, GFP_KERNEL);
        if (!ops)
-                goto out_vn_rele;
+                goto out_dput;
        error = EFAULT;
        if (copy_from_user(ops, compat_ptr(am_hreq.ops), size))
@@ -527,20 +453,29 @@ xfs_compat_attrmulti_by_handle(
                switch (ops[i].am_opcode) {
                case ATTR_OP_GET:
-                        ops[i].am_error = xfs_attrmulti_attr_get(inode,
+                        ops[i].am_error = xfs_attrmulti_attr_get(
-                                        attr_name,
+                                        dentry->d_inode, attr_name,
                                        compat_ptr(ops[i].am_attrvalue),
                                        &ops[i].am_length, ops[i].am_flags);
                        break;
                case ATTR_OP_SET:
-                        ops[i].am_error = xfs_attrmulti_attr_set(inode,
+                        ops[i].am_error = mnt_want_write(parfilp->f_path.mnt);
-                                        attr_name,
+                        if (ops[i].am_error)
+                                break;
+                        ops[i].am_error = xfs_attrmulti_attr_set(
+                                        dentry->d_inode, attr_name,
                                        compat_ptr(ops[i].am_attrvalue),
                                        ops[i].am_length, ops[i].am_flags);
+                        mnt_drop_write(parfilp->f_path.mnt);
                        break;
                case ATTR_OP_REMOVE:
-                        ops[i].am_error = xfs_attrmulti_attr_remove(inode,
+                        ops[i].am_error = mnt_want_write(parfilp->f_path.mnt);
-                                        attr_name, ops[i].am_flags);
+                        if (ops[i].am_error)
+                                break;
+                        ops[i].am_error = xfs_attrmulti_attr_remove(
+                                        dentry->d_inode, attr_name,
+                                        ops[i].am_flags);
+                        mnt_drop_write(parfilp->f_path.mnt);
                        break;
                default:
                        ops[i].am_error = EINVAL;
@@ -553,22 +488,20 @@ xfs_compat_attrmulti_by_handle(
        kfree(attr_name);
 out_kfree_ops:
        kfree(ops);
- out_vn_rele:
+ out_dput:
-        iput(inode);
+        dput(dentry);
- out:
        return -error;
 }
 STATIC int
 xfs_compat_fssetdm_by_handle(
-        xfs_mount_t             *mp,
+        struct file             *parfilp,
-        void                    __user *arg,
+        void                    __user *arg)
-        struct inode            *parinode)
 {
        int                     error;
        struct fsdmidata        fsd;
        compat_xfs_fsop_setdm_handlereq_t dmhreq;
-        struct inode            *inode;
+        struct dentry           *dentry;
        if (!capable(CAP_MKNOD))
                return -XFS_ERROR(EPERM);
@@ -576,12 +509,11 @@ xfs_compat_fssetdm_by_handle(
                           sizeof(compat_xfs_fsop_setdm_handlereq_t)))
                return -XFS_ERROR(EFAULT);
-        error = xfs_vget_fsop_handlereq_compat(mp, parinode, &dmhreq.hreq,
+        dentry = xfs_compat_handlereq_to_dentry(parfilp, &dmhreq.hreq);
-                                               &inode);
+        if (IS_ERR(dentry))
-        if (error)
+                return PTR_ERR(dentry);
-                return -error;
-        if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) {
+        if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) {
                error = -XFS_ERROR(EPERM);
                goto out;
        }
@@ -591,11 +523,11 @@ xfs_compat_fssetdm_by_handle(
                goto out;
        }
-        error = -xfs_set_dmattrs(XFS_I(inode), fsd.fsd_dmevmask,
+        error = -xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask,
                                 fsd.fsd_dmstate);
 out:
-        iput(inode);
+        dput(dentry);
        return error;
 }
@@ -632,8 +564,6 @@ xfs_file_compat_ioctl(
        case XFS_IOC_SET_RESBLKS:
        case XFS_IOC_GET_RESBLKS:
        case XFS_IOC_FSGROWFSLOG:
-        case XFS_IOC_FREEZE:
-        case XFS_IOC_THAW:
        case XFS_IOC_GOINGDOWN:
        case XFS_IOC_ERROR_INJECTION:
        case XFS_IOC_ERROR_CLEARALL:
@@ -724,21 +654,21 @@ xfs_file_compat_ioctl(
                if (xfs_compat_handlereq_copyin(&hreq, arg))
                        return -XFS_ERROR(EFAULT);
-                return xfs_open_by_handle(mp, &hreq, filp, inode);
+                return xfs_open_by_handle(filp, &hreq);
        }
        case XFS_IOC_READLINK_BY_HANDLE_32: {
                struct xfs_fsop_handlereq       hreq;
                if (xfs_compat_handlereq_copyin(&hreq, arg))
                        return -XFS_ERROR(EFAULT);
-                return xfs_readlink_by_handle(mp, &hreq, inode);
+                return xfs_readlink_by_handle(filp, &hreq);
        }
        case XFS_IOC_ATTRLIST_BY_HANDLE_32:
-                return xfs_compat_attrlist_by_handle(mp, arg, inode);
+                return xfs_compat_attrlist_by_handle(filp, arg);
        case XFS_IOC_ATTRMULTI_BY_HANDLE_32:
-                return xfs_compat_attrmulti_by_handle(mp, arg, inode);
+                return xfs_compat_attrmulti_by_handle(filp, arg);
        case XFS_IOC_FSSETDM_BY_HANDLE_32:
-                return xfs_compat_fssetdm_by_handle(mp, arg, inode);
+                return xfs_compat_fssetdm_by_handle(filp, arg);
        default:
                return -XFS_ERROR(ENOIOCTLCMD);
        }
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 36f6cc703ef2..c71e226da7f5 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1197,6 +1197,7 @@ xfs_fs_remount(
        struct xfs_mount        *mp = XFS_M(sb);
        substring_t             args[MAX_OPT_ARGS];
        char                    *p;
+        int                     error;
        while ((p = strsep(&options, ",")) != NULL) {
                int token;
@@ -1247,11 +1248,25 @@ xfs_fs_remount(
                }
        }
-        /* rw/ro -> rw */
+        /* ro -> rw */
        if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) {
                mp->m_flags &= ~XFS_MOUNT_RDONLY;
                if (mp->m_flags & XFS_MOUNT_BARRIER)
                        xfs_mountfs_check_barriers(mp);
+                /*
+                 * If this is the first remount to writeable state we
+                 * might have some superblock changes to update.
+                 */
+                if (mp->m_update_flags) {
+                        error = xfs_mount_log_sb(mp, mp->m_update_flags);
+                        if (error) {
+                                cmn_err(CE_WARN,
+                                        "XFS: failed to write sb changes");
+                                return error;
+                        }
+                        mp->m_update_flags = 0;
+                }
        }
        /* rw -> ro */
@@ -1269,14 +1284,14 @@ xfs_fs_remount(
 * need to take care of the metadata. Once that's done write a dummy
 * record to dirty the log in case of a crash while frozen.
 */
-STATIC void
+STATIC int
-xfs_fs_lockfs(
+xfs_fs_freeze(
        struct super_block      *sb)
 {
        struct xfs_mount        *mp = XFS_M(sb);
        xfs_quiesce_attr(mp);
-        xfs_fs_log_dummy(mp);
+        return -xfs_fs_log_dummy(mp);
 }
 STATIC int
@@ -1348,7 +1363,7 @@ xfs_finish_flags(
 {
        int                     ronly = (mp->m_flags & XFS_MOUNT_RDONLY);
-        /* Fail a mount where the logbuf is smaller then the log stripe */
+        /* Fail a mount where the logbuf is smaller than the log stripe */
        if (xfs_sb_version_haslogv2(&mp->m_sb)) {
                if (mp->m_logbsize <= 0 &&
                    mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE) {
@@ -1557,7 +1572,7 @@ static struct super_operations xfs_super_operations = {
        .put_super              = xfs_fs_put_super,
        .write_super            = xfs_fs_write_super,
        .sync_fs                = xfs_fs_sync_super,
-        .write_super_lockfs     = xfs_fs_lockfs,
+        .freeze_fs              = xfs_fs_freeze,
        .statfs                 = xfs_fs_statfs,
        .remount_fs             = xfs_fs_remount,
        .show_options           = xfs_fs_show_options,
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 2ed035354c26..a608e72fa405 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -371,7 +371,11 @@ xfs_quiesce_attr(
        /* flush inodes and push all remaining buffers out to disk */
        xfs_quiesce_fs(mp);
-        ASSERT_ALWAYS(atomic_read(&mp->m_active_trans) == 0);
+        /*
+         * Just warn here till VFS can correctly support
+         * read-only remount without racing.
+         */
+        WARN_ON(atomic_read(&mp->m_active_trans) != 0);
        /* Push the superblock and write an unmount record */
        error = xfs_log_sbcount(mp, 1);
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index 591ca6602bfb..6543c0b29753 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -73,6 +73,8 @@ int xfs_dqreq_num;
 int xfs_dqerror_mod = 33;
 #endif
+static struct lock_class_key xfs_dquot_other_class;
 /*
 * Allocate and initialize a dquot. We don't always allocate fresh memory;
 * we try to reclaim a free dquot if the number of incore dquots are above
@@ -139,7 +141,15 @@ xfs_qm_dqinit(
                 ASSERT(dqp->q_trace);
                 xfs_dqtrace_entry(dqp, "DQRECLAIMED_INIT");
 #endif
-         }
+        }
+        /*
+         * In either case we need to make sure group quotas have a different
+         * lock class than user quotas, to make sure lockdep knows we can
+         * hold locks of one of each at the same time.
+         */
+        if (!(type & XFS_DQ_USER))
+                lockdep_set_class(&dqp->q_qlock, &xfs_dquot_other_class);
        /*
         * log item gets initialized later
@@ -421,7 +431,7 @@ xfs_qm_dqalloc(
        /*
         * Initialize the bmap freelist prior to calling bmapi code.
         */
-        XFS_BMAP_INIT(&flist, &firstblock);
+        xfs_bmap_init(&flist, &firstblock);
        xfs_ilock(quotip, XFS_ILOCK_EXCL);
        /*
         * Return if this type of quotas is turned off while we didn't
@@ -1383,6 +1393,12 @@ xfs_dqunlock_nonotify(
        mutex_unlock(&(dqp->q_qlock));
 }
+/*
+ * Lock two xfs_dquot structures.
+ *
+ * To avoid deadlocks we always lock the quota structure with
+ * the lower id first.
+ */
 void
 xfs_dqlock2(
        xfs_dquot_t     *d1,
@@ -1392,18 +1408,16 @@ xfs_dqlock2(
                ASSERT(d1 != d2);
                if (be32_to_cpu(d1->q_core.d_id) >
                    be32_to_cpu(d2->q_core.d_id)) {
-                        xfs_dqlock(d2);
+                        mutex_lock(&d2->q_qlock);
-                        xfs_dqlock(d1);
+                        mutex_lock_nested(&d1->q_qlock, XFS_QLOCK_NESTED);
                } else {
-                        xfs_dqlock(d1);
+                        mutex_lock(&d1->q_qlock);
-                        xfs_dqlock(d2);
+                        mutex_lock_nested(&d2->q_qlock, XFS_QLOCK_NESTED);
-                }
-        } else {
-                if (d1) {
-                        xfs_dqlock(d1);
-                } else if (d2) {
-                        xfs_dqlock(d2);
                }
+        } else if (d1) {
+                mutex_lock(&d1->q_qlock);
+        } else if (d2) {
+                mutex_lock(&d2->q_qlock);
        }
 }
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
index 7e455337e2ba..d443e93b4331 100644
--- a/fs/xfs/quota/xfs_dquot.h
+++ b/fs/xfs/quota/xfs_dquot.h
@@ -97,6 +97,16 @@ typedef struct xfs_dquot {
 #define dq_hashlist     q_lists.dqm_hashlist
 #define dq_flags        q_lists.dqm_flags
+/*
+ * Lock hierarchy for q_qlock:
+ *      XFS_QLOCK_NORMAL is the implicit default,
+ *      XFS_QLOCK_NESTED is the dquot with the higher id in xfs_dqlock2
+ */
+enum {
+        XFS_QLOCK_NORMAL = 0,
+        XFS_QLOCK_NESTED,
+};
 #define XFS_DQHOLD(dqp)         ((dqp)->q_nrefs++)
 #ifdef DEBUG
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 6b13960cf318..7a2beb64314f 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -1070,6 +1070,13 @@ xfs_qm_sync(
        return 0;
 }
+/*
+ * The hash chains and the mplist use the same xfs_dqhash structure as
+ * their list head, but we can take the mplist qh_lock and one of the
+ * hash qh_locks at the same time without any problem as they aren't
+ * related.
+ */
+static struct lock_class_key xfs_quota_mplist_class;
 /*
 * This initializes all the quota information that's kept in the
@@ -1105,6 +1112,8 @@ xfs_qm_init_quotainfo(
        }
        xfs_qm_list_init(&qinf->qi_dqlist, "mpdqlist", 0);
+        lockdep_set_class(&qinf->qi_dqlist.qh_lock, &xfs_quota_mplist_class);
        qinf->qi_dqreclaims = 0;
        /* mutex used to serialize quotaoffs */
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index a4e293b93efa..642f1db4def4 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -22,7 +22,6 @@
 * Access Control Lists
 */
 typedef __uint16_t      xfs_acl_perm_t;
-typedef __int32_t       xfs_acl_type_t;
 typedef __int32_t       xfs_acl_tag_t;
 typedef __int32_t       xfs_acl_id_t;
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index f2e21817a226..143d63ecb20a 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -231,7 +231,7 @@ typedef struct xfs_perag
 #define XFS_FSB_TO_AGNO(mp,fsbno)       \
        ((xfs_agnumber_t)((fsbno) >> (mp)->m_sb.sb_agblklog))
 #define XFS_FSB_TO_AGBNO(mp,fsbno)      \
-        ((xfs_agblock_t)((fsbno) & XFS_MASK32LO((mp)->m_sb.sb_agblklog)))
+        ((xfs_agblock_t)((fsbno) & xfs_mask32lo((mp)->m_sb.sb_agblklog)))
 #define XFS_AGB_TO_DADDR(mp,agno,agbno) \
        ((xfs_daddr_t)XFS_FSB_TO_BB(mp, \
                (xfs_fsblock_t)(agno) * (mp)->m_sb.sb_agblocks + (agbno)))
@@ -244,8 +244,8 @@ typedef struct xfs_perag
 #define XFS_AG_CHECK_DADDR(mp,d,len)    \
        ((len) == 1 ? \
            ASSERT((d) == XFS_SB_DADDR || \
-                   XFS_DADDR_TO_AGBNO(mp, d) != XFS_SB_DADDR) : \
+                   xfs_daddr_to_agbno(mp, d) != XFS_SB_DADDR) : \
-            ASSERT(XFS_DADDR_TO_AGNO(mp, d) == \
+            ASSERT(xfs_daddr_to_agno(mp, d) == \
-                   XFS_DADDR_TO_AGNO(mp, (d) + (len) - 1)))
+                   xfs_daddr_to_agno(mp, (d) + (len) - 1)))
 #endif  /* __XFS_AG_H__ */
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index 733cb75a8c5d..c10c3a292d30 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -115,7 +115,7 @@ xfs_allocbt_free_block(
        xfs_agblock_t           bno;
        int                     error;
-        bno = XFS_DADDR_TO_AGBNO(cur->bc_mp, XFS_BUF_ADDR(bp));
+        bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp));
        error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1);
        if (error)
                return error;
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index f7cdc28aff41..5fde1654b430 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -374,7 +374,7 @@ xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name,
                 * It won't fit in the shortform, transform to a leaf block.
                 * GROT: another possible req'mt for a double-split btree op.
                 */
-                XFS_BMAP_INIT(args.flist, args.firstblock);
+                xfs_bmap_init(args.flist, args.firstblock);
                error = xfs_attr_shortform_to_leaf(&args);
                if (!error) {
                        error = xfs_bmap_finish(&args.trans, args.flist,
@@ -956,7 +956,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
                 * Commit that transaction so that the node_addname() call
                 * can manage its own transactions.
                 */
-                XFS_BMAP_INIT(args->flist, args->firstblock);
+                xfs_bmap_init(args->flist, args->firstblock);
                error = xfs_attr_leaf_to_node(args);
                if (!error) {
                        error = xfs_bmap_finish(&args->trans, args->flist,
@@ -1057,7 +1057,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
                 * If the result is small enough, shrink it all into the inode.
                 */
                if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
-                        XFS_BMAP_INIT(args->flist, args->firstblock);
+                        xfs_bmap_init(args->flist, args->firstblock);
                        error = xfs_attr_leaf_to_shortform(bp, args, forkoff);
                        /* bp is gone due to xfs_da_shrink_inode */
                        if (!error) {
@@ -1135,7 +1135,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
         * If the result is small enough, shrink it all into the inode.
         */
        if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
-                XFS_BMAP_INIT(args->flist, args->firstblock);
+                xfs_bmap_init(args->flist, args->firstblock);
                error = xfs_attr_leaf_to_shortform(bp, args, forkoff);
                /* bp is gone due to xfs_da_shrink_inode */
                if (!error) {
@@ -1290,7 +1290,7 @@ restart:
                         * have been a b-tree.
                         */
                        xfs_da_state_free(state);
-                        XFS_BMAP_INIT(args->flist, args->firstblock);
+                        xfs_bmap_init(args->flist, args->firstblock);
                        error = xfs_attr_leaf_to_node(args);
                        if (!error) {
                                error = xfs_bmap_finish(&args->trans,
@@ -1331,7 +1331,7 @@ restart:
                 * in the index/blkno/rmtblkno/rmtblkcnt fields and
                 * in the index2/blkno2/rmtblkno2/rmtblkcnt2 fields.
                 */
-                XFS_BMAP_INIT(args->flist, args->firstblock);
+                xfs_bmap_init(args->flist, args->firstblock);
                error = xfs_da_split(state);
                if (!error) {
                        error = xfs_bmap_finish(&args->trans, args->flist,
@@ -1443,7 +1443,7 @@ restart:
                 * Check to see if the tree needs to be collapsed.
                 */
                if (retval && (state->path.active > 1)) {
-                        XFS_BMAP_INIT(args->flist, args->firstblock);
+                        xfs_bmap_init(args->flist, args->firstblock);
                        error = xfs_da_join(state);
                        if (!error) {
                                error = xfs_bmap_finish(&args->trans,
@@ -1579,7 +1579,7 @@ xfs_attr_node_removename(xfs_da_args_t *args)
         * Check to see if the tree needs to be collapsed.
         */
        if (retval && (state->path.active > 1)) {
-                XFS_BMAP_INIT(args->flist, args->firstblock);
+                xfs_bmap_init(args->flist, args->firstblock);
                error = xfs_da_join(state);
                if (!error) {
                        error = xfs_bmap_finish(&args->trans, args->flist,
@@ -1630,7 +1630,7 @@ xfs_attr_node_removename(xfs_da_args_t *args)
                                                       == XFS_ATTR_LEAF_MAGIC);
                if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
-                        XFS_BMAP_INIT(args->flist, args->firstblock);
+                        xfs_bmap_init(args->flist, args->firstblock);
                        error = xfs_attr_leaf_to_shortform(bp, args, forkoff);
                        /* bp is gone due to xfs_da_shrink_inode */
                        if (!error) {
@@ -2069,7 +2069,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
                /*
                 * Allocate a single extent, up to the size of the value.
                 */
-                XFS_BMAP_INIT(args->flist, args->firstblock);
+                xfs_bmap_init(args->flist, args->firstblock);
                nmap = 1;
                error = xfs_bmapi(args->trans, dp, (xfs_fileoff_t)lblkno,
                                  blkcnt,
@@ -2123,7 +2123,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
                /*
                 * Try to remember where we decided to put the value.
                 */
-                XFS_BMAP_INIT(args->flist, args->firstblock);
+                xfs_bmap_init(args->flist, args->firstblock);
                nmap = 1;
                error = xfs_bmapi(NULL, dp, (xfs_fileoff_t)lblkno,
                                  args->rmtblkcnt,
@@ -2188,7 +2188,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
                /*
                 * Try to remember where we decided to put the value.
                 */
-                XFS_BMAP_INIT(args->flist, args->firstblock);
+                xfs_bmap_init(args->flist, args->firstblock);
                nmap = 1;
                error = xfs_bmapi(NULL, args->dp, (xfs_fileoff_t)lblkno,
                                        args->rmtblkcnt,
@@ -2229,7 +2229,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
        blkcnt = args->rmtblkcnt;
        done = 0;
        while (!done) {
-                XFS_BMAP_INIT(args->flist, args->firstblock);
+                xfs_bmap_init(args->flist, args->firstblock);
                error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
                                    XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
                                    1, args->firstblock, args->flist,
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 79da6b2ea99e..6c323f8a4cd1 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -736,7 +736,7 @@ xfs_attr_shortform_allfit(xfs_dabuf_t *bp, xfs_inode_t *dp)
                        continue;               /* don't copy partial entries */
                if (!(entry->flags & XFS_ATTR_LOCAL))
                        return(0);
-                name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, i);
+                name_loc = xfs_attr_leaf_name_local(leaf, i);
                if (name_loc->namelen >= XFS_ATTR_SF_ENTSIZE_MAX)
                        return(0);
                if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX)
@@ -823,7 +823,7 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff)
                if (!entry->nameidx)
                        continue;
                ASSERT(entry->flags & XFS_ATTR_LOCAL);
-                name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, i);
+                name_loc = xfs_attr_leaf_name_local(leaf, i);
                nargs.name = (char *)name_loc->nameval;
                nargs.namelen = name_loc->namelen;
                nargs.value = (char *)&name_loc->nameval[nargs.namelen];
@@ -1141,14 +1141,14 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex)
         * as part of this transaction (a split operation for example).
         */
        if (entry->flags & XFS_ATTR_LOCAL) {
-                name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, args->index);
+                name_loc = xfs_attr_leaf_name_local(leaf, args->index);
                name_loc->namelen = args->namelen;
                name_loc->valuelen = cpu_to_be16(args->valuelen);
                memcpy((char *)name_loc->nameval, args->name, args->namelen);
                memcpy((char *)&name_loc->nameval[args->namelen], args->value,
                                   be16_to_cpu(name_loc->valuelen));
        } else {
-                name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index);
+                name_rmt = xfs_attr_leaf_name_remote(leaf, args->index);
                name_rmt->namelen = args->namelen;
                memcpy((char *)name_rmt->name, args->name, args->namelen);
                entry->flags |= XFS_ATTR_INCOMPLETE;
@@ -1159,7 +1159,7 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex)
                args->rmtblkcnt = XFS_B_TO_FSB(mp, args->valuelen);
        }
        xfs_da_log_buf(args->trans, bp,
-             XFS_DA_LOGRANGE(leaf, XFS_ATTR_LEAF_NAME(leaf, args->index),
+             XFS_DA_LOGRANGE(leaf, xfs_attr_leaf_name(leaf, args->index),
                                   xfs_attr_leaf_entsize(leaf, args->index)));
        /*
@@ -1749,10 +1749,10 @@ xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args)
        /*
         * Compress the remaining entries and zero out the removed stuff.
         */
-        memset(XFS_ATTR_LEAF_NAME(leaf, args->index), 0, entsize);
+        memset(xfs_attr_leaf_name(leaf, args->index), 0, entsize);
        be16_add_cpu(&hdr->usedbytes, -entsize);
        xfs_da_log_buf(args->trans, bp,
-             XFS_DA_LOGRANGE(leaf, XFS_ATTR_LEAF_NAME(leaf, args->index),
+             XFS_DA_LOGRANGE(leaf, xfs_attr_leaf_name(leaf, args->index),
                                   entsize));
        tmp = (be16_to_cpu(hdr->count) - args->index)
@@ -1985,7 +1985,7 @@ xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args)
                        continue;
                }
                if (entry->flags & XFS_ATTR_LOCAL) {
-                        name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, probe);
+                        name_loc = xfs_attr_leaf_name_local(leaf, probe);
                        if (name_loc->namelen != args->namelen)
                                continue;
                        if (memcmp(args->name, (char *)name_loc->nameval, args->namelen) != 0)
@@ -1995,7 +1995,7 @@ xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args)
                        args->index = probe;
                        return(XFS_ERROR(EEXIST));
                } else {
-                        name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, probe);
+                        name_rmt = xfs_attr_leaf_name_remote(leaf, probe);
                        if (name_rmt->namelen != args->namelen)
                                continue;
                        if (memcmp(args->name, (char *)name_rmt->name,
@@ -2035,7 +2035,7 @@ xfs_attr_leaf_getvalue(xfs_dabuf_t *bp, xfs_da_args_t *args)
        entry = &leaf->entries[args->index];
        if (entry->flags & XFS_ATTR_LOCAL) {
-                name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, args->index);
+                name_loc = xfs_attr_leaf_name_local(leaf, args->index);
                ASSERT(name_loc->namelen == args->namelen);
                ASSERT(memcmp(args->name, name_loc->nameval, args->namelen) == 0);
                valuelen = be16_to_cpu(name_loc->valuelen);
@@ -2050,7 +2050,7 @@ xfs_attr_leaf_getvalue(xfs_dabuf_t *bp, xfs_da_args_t *args)
                args->valuelen = valuelen;
                memcpy(args->value, &name_loc->nameval[args->namelen], valuelen);
        } else {
-                name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index);
+                name_rmt = xfs_attr_leaf_name_remote(leaf, args->index);
                ASSERT(name_rmt->namelen == args->namelen);
                ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0);
                valuelen = be32_to_cpu(name_rmt->valuelen);
@@ -2143,7 +2143,7 @@ xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s,
                 * off for 6.2, should be revisited later.
                 */
                if (entry_s->flags & XFS_ATTR_INCOMPLETE) { /* skip partials? */
-                        memset(XFS_ATTR_LEAF_NAME(leaf_s, start_s + i), 0, tmp);
+                        memset(xfs_attr_leaf_name(leaf_s, start_s + i), 0, tmp);
                        be16_add_cpu(&hdr_s->usedbytes, -tmp);
                        be16_add_cpu(&hdr_s->count, -1);
                        entry_d--;      /* to compensate for ++ in loop hdr */
@@ -2160,11 +2160,11 @@ xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s,
                        entry_d->flags = entry_s->flags;
                        ASSERT(be16_to_cpu(entry_d->nameidx) + tmp
                                                        <= XFS_LBSIZE(mp));
-                        memmove(XFS_ATTR_LEAF_NAME(leaf_d, desti),
+                        memmove(xfs_attr_leaf_name(leaf_d, desti),
-                                XFS_ATTR_LEAF_NAME(leaf_s, start_s + i), tmp);
+                                xfs_attr_leaf_name(leaf_s, start_s + i), tmp);
                        ASSERT(be16_to_cpu(entry_s->nameidx) + tmp
                                                        <= XFS_LBSIZE(mp));
-                        memset(XFS_ATTR_LEAF_NAME(leaf_s, start_s + i), 0, tmp);
+                        memset(xfs_attr_leaf_name(leaf_s, start_s + i), 0, tmp);
                        be16_add_cpu(&hdr_s->usedbytes, -tmp);
                        be16_add_cpu(&hdr_d->usedbytes, tmp);
                        be16_add_cpu(&hdr_s->count, -1);
@@ -2276,12 +2276,12 @@ xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index)
        ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC);
        if (leaf->entries[index].flags & XFS_ATTR_LOCAL) {
-                name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, index);
+                name_loc = xfs_attr_leaf_name_local(leaf, index);
-                size = XFS_ATTR_LEAF_ENTSIZE_LOCAL(name_loc->namelen,
+                size = xfs_attr_leaf_entsize_local(name_loc->namelen,
                                                   be16_to_cpu(name_loc->valuelen));
        } else {
-                name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, index);
+                name_rmt = xfs_attr_leaf_name_remote(leaf, index);
-                size = XFS_ATTR_LEAF_ENTSIZE_REMOTE(name_rmt->namelen);
+                size = xfs_attr_leaf_entsize_remote(name_rmt->namelen);
        }
        return(size);
 }
@@ -2297,13 +2297,13 @@ xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, int *local)
 {
        int size;
-        size = XFS_ATTR_LEAF_ENTSIZE_LOCAL(namelen, valuelen);
+        size = xfs_attr_leaf_entsize_local(namelen, valuelen);
-        if (size < XFS_ATTR_LEAF_ENTSIZE_LOCAL_MAX(blocksize)) {
+        if (size < xfs_attr_leaf_entsize_local_max(blocksize)) {
                if (local) {
                        *local = 1;
                }
        } else {
-                size = XFS_ATTR_LEAF_ENTSIZE_REMOTE(namelen);
+                size = xfs_attr_leaf_entsize_remote(namelen);
                if (local) {
                        *local = 0;
                }
@@ -2372,7 +2372,7 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
                if (entry->flags & XFS_ATTR_LOCAL) {
                        xfs_attr_leaf_name_local_t *name_loc =
-                                XFS_ATTR_LEAF_NAME_LOCAL(leaf, i);
+                                xfs_attr_leaf_name_local(leaf, i);
                        retval = context->put_listent(context,
                                                entry->flags,
@@ -2384,7 +2384,7 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
                                return retval;
                } else {
                        xfs_attr_leaf_name_remote_t *name_rmt =
-                                XFS_ATTR_LEAF_NAME_REMOTE(leaf, i);
+                                xfs_attr_leaf_name_remote(leaf, i);
                        int valuelen = be32_to_cpu(name_rmt->valuelen);
@@ -2468,11 +2468,11 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args)
 #ifdef DEBUG
        if (entry->flags & XFS_ATTR_LOCAL) {
-                name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, args->index);
+                name_loc = xfs_attr_leaf_name_local(leaf, args->index);
                namelen = name_loc->namelen;
                name = (char *)name_loc->nameval;
        } else {
-                name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index);
+                name_rmt = xfs_attr_leaf_name_remote(leaf, args->index);
                namelen = name_rmt->namelen;
                name = (char *)name_rmt->name;
        }
@@ -2487,7 +2487,7 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args)
        if (args->rmtblkno) {
                ASSERT((entry->flags & XFS_ATTR_LOCAL) == 0);
-                name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index);
+                name_rmt = xfs_attr_leaf_name_remote(leaf, args->index);
                name_rmt->valueblk = cpu_to_be32(args->rmtblkno);
                name_rmt->valuelen = cpu_to_be32(args->valuelen);
                xfs_da_log_buf(args->trans, bp,
@@ -2534,7 +2534,7 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args)
        xfs_da_log_buf(args->trans, bp,
                        XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
        if ((entry->flags & XFS_ATTR_LOCAL) == 0) {
-                name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index);
+                name_rmt = xfs_attr_leaf_name_remote(leaf, args->index);
                name_rmt->valueblk = 0;
                name_rmt->valuelen = 0;
                xfs_da_log_buf(args->trans, bp,
@@ -2607,20 +2607,20 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
 #ifdef DEBUG
        if (entry1->flags & XFS_ATTR_LOCAL) {
-                name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf1, args->index);
+                name_loc = xfs_attr_leaf_name_local(leaf1, args->index);
                namelen1 = name_loc->namelen;
                name1 = (char *)name_loc->nameval;
        } else {
-                name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf1, args->index);
+                name_rmt = xfs_attr_leaf_name_remote(leaf1, args->index);
                namelen1 = name_rmt->namelen;
                name1 = (char *)name_rmt->name;
        }
        if (entry2->flags & XFS_ATTR_LOCAL) {
-                name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf2, args->index2);
+                name_loc = xfs_attr_leaf_name_local(leaf2, args->index2);
                namelen2 = name_loc->namelen;
                name2 = (char *)name_loc->nameval;
        } else {
-                name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf2, args->index2);
+                name_rmt = xfs_attr_leaf_name_remote(leaf2, args->index2);
                namelen2 = name_rmt->namelen;
                name2 = (char *)name_rmt->name;
        }
@@ -2637,7 +2637,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
                          XFS_DA_LOGRANGE(leaf1, entry1, sizeof(*entry1)));
        if (args->rmtblkno) {
                ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0);
-                name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf1, args->index);
+                name_rmt = xfs_attr_leaf_name_remote(leaf1, args->index);
                name_rmt->valueblk = cpu_to_be32(args->rmtblkno);
                name_rmt->valuelen = cpu_to_be32(args->valuelen);
                xfs_da_log_buf(args->trans, bp1,
@@ -2648,7 +2648,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
        xfs_da_log_buf(args->trans, bp2,
                          XFS_DA_LOGRANGE(leaf2, entry2, sizeof(*entry2)));
        if ((entry2->flags & XFS_ATTR_LOCAL) == 0) {
-                name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf2, args->index2);
+                name_rmt = xfs_attr_leaf_name_remote(leaf2, args->index2);
                name_rmt->valueblk = 0;
                name_rmt->valuelen = 0;
                xfs_da_log_buf(args->trans, bp2,
@@ -2855,7 +2855,7 @@ xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp)
        for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) {
                if (be16_to_cpu(entry->nameidx) &&
                    ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
-                        name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, i);
+                        name_rmt = xfs_attr_leaf_name_remote(leaf, i);
                        if (name_rmt->valueblk)
                                count++;
                }
@@ -2883,7 +2883,7 @@ xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp)
        for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) {
                if (be16_to_cpu(entry->nameidx) &&
                    ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
-                        name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, i);
+                        name_rmt = xfs_attr_leaf_name_remote(leaf, i);
                        if (name_rmt->valueblk) {
                                lp->valueblk = be32_to_cpu(name_rmt->valueblk);
                                lp->valuelen = XFS_B_TO_FSB(dp->i_mount,
diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h
index 83e9af417ca2..9c7d22fdcf4d 100644
--- a/fs/xfs/xfs_attr_leaf.h
+++ b/fs/xfs/xfs_attr_leaf.h
@@ -151,8 +151,6 @@ typedef struct xfs_attr_leafblock {
 /*
 * Cast typed pointers for "local" and "remote" name/value structs.
 */
-#define XFS_ATTR_LEAF_NAME_REMOTE(leafp,idx)    \
-        xfs_attr_leaf_name_remote(leafp,idx)
 static inline xfs_attr_leaf_name_remote_t *
 xfs_attr_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx)
 {
@@ -160,8 +158,6 @@ xfs_attr_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx)
                &((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)];
 }
-#define XFS_ATTR_LEAF_NAME_LOCAL(leafp,idx)     \
-        xfs_attr_leaf_name_local(leafp,idx)
 static inline xfs_attr_leaf_name_local_t *
 xfs_attr_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx)
 {
@@ -169,8 +165,6 @@ xfs_attr_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx)
                &((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)];
 }
-#define XFS_ATTR_LEAF_NAME(leafp,idx)           \
-        xfs_attr_leaf_name(leafp,idx)
 static inline char *xfs_attr_leaf_name(xfs_attr_leafblock_t *leafp, int idx)
 {
        return &((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)];
@@ -181,24 +175,18 @@ static inline char *xfs_attr_leaf_name(xfs_attr_leafblock_t *leafp, int idx)
 * a "local" name/value structure, a "remote" name/value structure, and
 * a pointer which might be either.
 */
-#define XFS_ATTR_LEAF_ENTSIZE_REMOTE(nlen)      \
-        xfs_attr_leaf_entsize_remote(nlen)
 static inline int xfs_attr_leaf_entsize_remote(int nlen)
 {
        return ((uint)sizeof(xfs_attr_leaf_name_remote_t) - 1 + (nlen) + \
                XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1);
 }
-#define XFS_ATTR_LEAF_ENTSIZE_LOCAL(nlen,vlen)  \
-        xfs_attr_leaf_entsize_local(nlen,vlen)
 static inline int xfs_attr_leaf_entsize_local(int nlen, int vlen)
 {
        return ((uint)sizeof(xfs_attr_leaf_name_local_t) - 1 + (nlen) + (vlen) +
                XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1);
 }
-#define XFS_ATTR_LEAF_ENTSIZE_LOCAL_MAX(bsize)  \
-        xfs_attr_leaf_entsize_local_max(bsize)
 static inline int xfs_attr_leaf_entsize_local_max(int bsize)
 {
        return (((bsize) >> 1) + ((bsize) >> 2));
diff --git a/fs/xfs/xfs_bit.h b/fs/xfs/xfs_bit.h
index bca7b243c319..f1e3c907044d 100644
--- a/fs/xfs/xfs_bit.h
+++ b/fs/xfs/xfs_bit.h
@@ -23,24 +23,16 @@
 */
 /*
- * masks with n high/low bits set, 32-bit values & 64-bit values
+ * masks with n high/low bits set, 64-bit values
 */
-#define XFS_MASK32HI(n)         xfs_mask32hi(n)
-static inline __uint32_t xfs_mask32hi(int n)
-{
-        return (__uint32_t)-1 << (32 - (n));
-}
-#define XFS_MASK64HI(n)         xfs_mask64hi(n)
 static inline __uint64_t xfs_mask64hi(int n)
 {
        return (__uint64_t)-1 << (64 - (n));
 }
-#define XFS_MASK32LO(n)         xfs_mask32lo(n)
 static inline __uint32_t xfs_mask32lo(int n)
 {
        return ((__uint32_t)1 << (n)) - 1;
 }
-#define XFS_MASK64LO(n)         xfs_mask64lo(n)
 static inline __uint64_t xfs_mask64lo(int n)
 {
        return ((__uint64_t)1 << (n)) - 1;
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 138308e70d14..c852cd65aaea 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -595,9 +595,9 @@ xfs_bmap_add_extent(
                xfs_iext_insert(ifp, 0, 1, new);
                ASSERT(cur == NULL);
                ifp->if_lastex = 0;
-                if (!ISNULLSTARTBLOCK(new->br_startblock)) {
+                if (!isnullstartblock(new->br_startblock)) {
                        XFS_IFORK_NEXT_SET(ip, whichfork, 1);
-                        logflags = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork);
+                        logflags = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
                } else
                        logflags = 0;
                /* DELTA: single new extent */
@@ -613,7 +613,7 @@ xfs_bmap_add_extent(
        /*
         * Any kind of new delayed allocation goes here.
         */
-        else if (ISNULLSTARTBLOCK(new->br_startblock)) {
+        else if (isnullstartblock(new->br_startblock)) {
                if (cur)
                        ASSERT((cur->bc_private.b.flags &
                                XFS_BTCUR_BPRV_WASDEL) == 0);
@@ -644,11 +644,11 @@ xfs_bmap_add_extent(
                 * in a delayed or unwritten allocation with a real one, or
                 * converting real back to unwritten.
                 */
-                if (!ISNULLSTARTBLOCK(new->br_startblock) &&
+                if (!isnullstartblock(new->br_startblock) &&
                    new->br_startoff + new->br_blockcount > prev.br_startoff) {
                        if (prev.br_state != XFS_EXT_UNWRITTEN &&
-                            ISNULLSTARTBLOCK(prev.br_startblock)) {
+                            isnullstartblock(prev.br_startblock)) {
-                                da_old = STARTBLOCKVAL(prev.br_startblock);
+                                da_old = startblockval(prev.br_startblock);
                                if (cur)
                                        ASSERT(cur->bc_private.b.flags &
                                                XFS_BTCUR_BPRV_WASDEL);
@@ -803,7 +803,7 @@ xfs_bmap_add_extent_delay_real(
         */
        if (STATE_SET_TEST(LEFT_VALID, idx > 0)) {
                xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT);
-                STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(LEFT.br_startblock));
+                STATE_SET(LEFT_DELAY, isnullstartblock(LEFT.br_startblock));
        }
        STATE_SET(LEFT_CONTIG,
                STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) &&
@@ -820,7 +820,7 @@ xfs_bmap_add_extent_delay_real(
                        idx <
                        ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1)) {
                xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT);
-                STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(RIGHT.br_startblock));
+                STATE_SET(RIGHT_DELAY, isnullstartblock(RIGHT.br_startblock));
        }
        STATE_SET(RIGHT_CONTIG,
                STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) &&
@@ -1019,8 +1019,8 @@ xfs_bmap_add_extent_delay_real(
                                goto done;
                }
                temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
-                        STARTBLOCKVAL(PREV.br_startblock));
+                        startblockval(PREV.br_startblock));
-                xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
+                xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
                XFS_BMAP_TRACE_POST_UPDATE("LF|LC", ip, idx, XFS_DATA_FORK);
                *dnew = temp;
                /* DELTA: The boundary between two in-core extents moved. */
@@ -1067,10 +1067,10 @@ xfs_bmap_add_extent_delay_real(
                                goto done;
                }
                temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
-                        STARTBLOCKVAL(PREV.br_startblock) -
+                        startblockval(PREV.br_startblock) -
                        (cur ? cur->bc_private.b.allocated : 0));
                ep = xfs_iext_get_ext(ifp, idx + 1);
-                xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
+                xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
                XFS_BMAP_TRACE_POST_UPDATE("LF", ip, idx + 1, XFS_DATA_FORK);
                *dnew = temp;
                /* DELTA: One in-core extent is split in two. */
@@ -1110,8 +1110,8 @@ xfs_bmap_add_extent_delay_real(
                                goto done;
                }
                temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
-                        STARTBLOCKVAL(PREV.br_startblock));
+                        startblockval(PREV.br_startblock));
-                xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
+                xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
                XFS_BMAP_TRACE_POST_UPDATE("RF|RC", ip, idx, XFS_DATA_FORK);
                *dnew = temp;
                /* DELTA: The boundary between two in-core extents moved. */
@@ -1157,10 +1157,10 @@ xfs_bmap_add_extent_delay_real(
                                goto done;
                }
                temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
-                        STARTBLOCKVAL(PREV.br_startblock) -
+                        startblockval(PREV.br_startblock) -
                        (cur ? cur->bc_private.b.allocated : 0));
                ep = xfs_iext_get_ext(ifp, idx);
-                xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
+                xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
                XFS_BMAP_TRACE_POST_UPDATE("RF", ip, idx, XFS_DATA_FORK);
                *dnew = temp;
                /* DELTA: One in-core extent is split in two. */
@@ -1213,7 +1213,7 @@ xfs_bmap_add_extent_delay_real(
                }
                temp = xfs_bmap_worst_indlen(ip, temp);
                temp2 = xfs_bmap_worst_indlen(ip, temp2);
-                diff = (int)(temp + temp2 - STARTBLOCKVAL(PREV.br_startblock) -
+                diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) -
                        (cur ? cur->bc_private.b.allocated : 0));
                if (diff > 0 &&
                    xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, -((int64_t)diff), rsvd)) {
@@ -1241,11 +1241,11 @@ xfs_bmap_add_extent_delay_real(
                        }
                }
                ep = xfs_iext_get_ext(ifp, idx);
-                xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
+                xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
                XFS_BMAP_TRACE_POST_UPDATE("0", ip, idx, XFS_DATA_FORK);
                XFS_BMAP_TRACE_PRE_UPDATE("0", ip, idx + 2, XFS_DATA_FORK);
                xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx + 2),
-                        NULLSTARTBLOCK((int)temp2));
+                        nullstartblock((int)temp2));
                XFS_BMAP_TRACE_POST_UPDATE("0", ip, idx + 2, XFS_DATA_FORK);
                *dnew = temp + temp2;
                /* DELTA: One in-core extent is split in three. */
@@ -1365,7 +1365,7 @@ xfs_bmap_add_extent_unwritten_real(
         */
        if (STATE_SET_TEST(LEFT_VALID, idx > 0)) {
                xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT);
-                STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(LEFT.br_startblock));
+                STATE_SET(LEFT_DELAY, isnullstartblock(LEFT.br_startblock));
        }
        STATE_SET(LEFT_CONTIG,
                STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) &&
@@ -1382,7 +1382,7 @@ xfs_bmap_add_extent_unwritten_real(
                        idx <
                        ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1)) {
                xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT);
-                STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(RIGHT.br_startblock));
+                STATE_SET(RIGHT_DELAY, isnullstartblock(RIGHT.br_startblock));
        }
        STATE_SET(RIGHT_CONTIG,
                STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) &&
@@ -1889,13 +1889,13 @@ xfs_bmap_add_extent_hole_delay(
        ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
        ep = xfs_iext_get_ext(ifp, idx);
        state = 0;
-        ASSERT(ISNULLSTARTBLOCK(new->br_startblock));
+        ASSERT(isnullstartblock(new->br_startblock));
        /*
         * Check and set flags if this segment has a left neighbor
         */
        if (STATE_SET_TEST(LEFT_VALID, idx > 0)) {
                xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left);
-                STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(left.br_startblock));
+                STATE_SET(LEFT_DELAY, isnullstartblock(left.br_startblock));
        }
        /*
         * Check and set flags if the current (right) segment exists.
@@ -1905,7 +1905,7 @@ xfs_bmap_add_extent_hole_delay(
                           idx <
                           ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) {
                xfs_bmbt_get_all(ep, &right);
-                STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(right.br_startblock));
+                STATE_SET(RIGHT_DELAY, isnullstartblock(right.br_startblock));
        }
        /*
         * Set contiguity flags on the left and right neighbors.
@@ -1938,12 +1938,12 @@ xfs_bmap_add_extent_hole_delay(
                XFS_BMAP_TRACE_PRE_UPDATE("LC|RC", ip, idx - 1,
                        XFS_DATA_FORK);
                xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp);
-                oldlen = STARTBLOCKVAL(left.br_startblock) +
+                oldlen = startblockval(left.br_startblock) +
-                        STARTBLOCKVAL(new->br_startblock) +
+                        startblockval(new->br_startblock) +
-                        STARTBLOCKVAL(right.br_startblock);
+                        startblockval(right.br_startblock);
                newlen = xfs_bmap_worst_indlen(ip, temp);
                xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1),
-                        NULLSTARTBLOCK((int)newlen));
+                        nullstartblock((int)newlen));
                XFS_BMAP_TRACE_POST_UPDATE("LC|RC", ip, idx - 1,
                        XFS_DATA_FORK);
                XFS_BMAP_TRACE_DELETE("LC|RC", ip, idx, 1, XFS_DATA_FORK);
@@ -1964,11 +1964,11 @@ xfs_bmap_add_extent_hole_delay(
                XFS_BMAP_TRACE_PRE_UPDATE("LC", ip, idx - 1,
                        XFS_DATA_FORK);
                xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp);
-                oldlen = STARTBLOCKVAL(left.br_startblock) +
+                oldlen = startblockval(left.br_startblock) +
-                        STARTBLOCKVAL(new->br_startblock);
+                        startblockval(new->br_startblock);
                newlen = xfs_bmap_worst_indlen(ip, temp);
                xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1),
-                        NULLSTARTBLOCK((int)newlen));
+                        nullstartblock((int)newlen));
                XFS_BMAP_TRACE_POST_UPDATE("LC", ip, idx - 1,
                        XFS_DATA_FORK);
                ip->i_df.if_lastex = idx - 1;
@@ -1985,11 +1985,11 @@ xfs_bmap_add_extent_hole_delay(
                 */
                XFS_BMAP_TRACE_PRE_UPDATE("RC", ip, idx, XFS_DATA_FORK);
                temp = new->br_blockcount + right.br_blockcount;
-                oldlen = STARTBLOCKVAL(new->br_startblock) +
+                oldlen = startblockval(new->br_startblock) +
-                        STARTBLOCKVAL(right.br_startblock);
+                        startblockval(right.br_startblock);
                newlen = xfs_bmap_worst_indlen(ip, temp);
                xfs_bmbt_set_allf(ep, new->br_startoff,
-                        NULLSTARTBLOCK((int)newlen), temp, right.br_state);
+                        nullstartblock((int)newlen), temp, right.br_state);
                XFS_BMAP_TRACE_POST_UPDATE("RC", ip, idx, XFS_DATA_FORK);
                ip->i_df.if_lastex = idx;
                /* DELTA: One in-core extent grew into a hole. */
@@ -2085,7 +2085,7 @@ xfs_bmap_add_extent_hole_real(
         */
        if (STATE_SET_TEST(LEFT_VALID, idx > 0)) {
                xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left);
-                STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(left.br_startblock));
+                STATE_SET(LEFT_DELAY, isnullstartblock(left.br_startblock));
        }
        /*
         * Check and set flags if this segment has a current value.
@@ -2095,7 +2095,7 @@ xfs_bmap_add_extent_hole_real(
                           idx <
                           ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) {
                xfs_bmbt_get_all(ep, &right);
-                STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(right.br_startblock));
+                STATE_SET(RIGHT_DELAY, isnullstartblock(right.br_startblock));
        }
        /*
         * We're inserting a real allocation between "left" and "right".
@@ -2143,7 +2143,7 @@ xfs_bmap_add_extent_hole_real(
                XFS_IFORK_NEXT_SET(ip, whichfork,
                        XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
                if (cur == NULL) {
-                        rval = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork);
+                        rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
                } else {
                        rval = XFS_ILOG_CORE;
                        if ((error = xfs_bmbt_lookup_eq(cur,
@@ -2185,7 +2185,7 @@ xfs_bmap_add_extent_hole_real(
                XFS_BMAP_TRACE_POST_UPDATE("LC", ip, idx - 1, whichfork);
                ifp->if_lastex = idx - 1;
                if (cur == NULL) {
-                        rval = XFS_ILOG_FEXT(whichfork);
+                        rval = xfs_ilog_fext(whichfork);
                } else {
                        rval = 0;
                        if ((error = xfs_bmbt_lookup_eq(cur,
@@ -2220,7 +2220,7 @@ xfs_bmap_add_extent_hole_real(
                XFS_BMAP_TRACE_POST_UPDATE("RC", ip, idx, whichfork);
                ifp->if_lastex = idx;
                if (cur == NULL) {
-                        rval = XFS_ILOG_FEXT(whichfork);
+                        rval = xfs_ilog_fext(whichfork);
                } else {
                        rval = 0;
                        if ((error = xfs_bmbt_lookup_eq(cur,
@@ -2254,7 +2254,7 @@ xfs_bmap_add_extent_hole_real(
                XFS_IFORK_NEXT_SET(ip, whichfork,
                        XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
                if (cur == NULL) {
-                        rval = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork);
+                        rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
                } else {
                        rval = XFS_ILOG_CORE;
                        if ((error = xfs_bmbt_lookup_eq(cur,
@@ -2482,7 +2482,7 @@ xfs_bmap_adjacent(
         * try to use it's last block as our starting point.
         */
        if (ap->eof && ap->prevp->br_startoff != NULLFILEOFF &&
-            !ISNULLSTARTBLOCK(ap->prevp->br_startblock) &&
+            !isnullstartblock(ap->prevp->br_startblock) &&
            ISVALID(ap->prevp->br_startblock + ap->prevp->br_blockcount,
                    ap->prevp->br_startblock)) {
                ap->rval = ap->prevp->br_startblock + ap->prevp->br_blockcount;
@@ -2511,7 +2511,7 @@ xfs_bmap_adjacent(
                 * start block based on it.
                 */
                if (ap->prevp->br_startoff != NULLFILEOFF &&
-                    !ISNULLSTARTBLOCK(ap->prevp->br_startblock) &&
+                    !isnullstartblock(ap->prevp->br_startblock) &&
                    (prevbno = ap->prevp->br_startblock +
                               ap->prevp->br_blockcount) &&
                    ISVALID(prevbno, ap->prevp->br_startblock)) {
@@ -2552,7 +2552,7 @@ xfs_bmap_adjacent(
                 * If there's a following (right) block, select a requested
                 * start block based on it.
                 */
-                if (!ISNULLSTARTBLOCK(ap->gotp->br_startblock)) {
+                if (!isnullstartblock(ap->gotp->br_startblock)) {
                        /*
                         * Calculate gap to start of next block.
                         */
@@ -3082,7 +3082,7 @@ xfs_bmap_btree_to_extents(
        ASSERT(ifp->if_broot == NULL);
        ASSERT((ifp->if_flags & XFS_IFBROOT) == 0);
        XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
-        *logflagsp = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork);
+        *logflagsp = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
        return 0;
 }
@@ -3136,8 +3136,8 @@ xfs_bmap_del_extent(
        del_endoff = del->br_startoff + del->br_blockcount;
        got_endoff = got.br_startoff + got.br_blockcount;
        ASSERT(got_endoff >= del_endoff);
-        delay = ISNULLSTARTBLOCK(got.br_startblock);
+        delay = isnullstartblock(got.br_startblock);
-        ASSERT(ISNULLSTARTBLOCK(del->br_startblock) == delay);
+        ASSERT(isnullstartblock(del->br_startblock) == delay);
        flags = 0;
        qfield = 0;
        error = 0;
@@ -3189,7 +3189,7 @@ xfs_bmap_del_extent(
                }
                da_old = da_new = 0;
        } else {
-                da_old = STARTBLOCKVAL(got.br_startblock);
+                da_old = startblockval(got.br_startblock);
                da_new = 0;
                nblks = 0;
                do_fx = 0;
@@ -3213,7 +3213,7 @@ xfs_bmap_del_extent(
                        XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
                flags |= XFS_ILOG_CORE;
                if (!cur) {
-                        flags |= XFS_ILOG_FEXT(whichfork);
+                        flags |= xfs_ilog_fext(whichfork);
                        break;
                }
                if ((error = xfs_btree_delete(cur, &i)))
@@ -3233,7 +3233,7 @@ xfs_bmap_del_extent(
                if (delay) {
                        temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
                                da_old);
-                        xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
+                        xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
                        XFS_BMAP_TRACE_POST_UPDATE("2", ip, idx,
                                whichfork);
                        da_new = temp;
@@ -3242,7 +3242,7 @@ xfs_bmap_del_extent(
                xfs_bmbt_set_startblock(ep, del_endblock);
                XFS_BMAP_TRACE_POST_UPDATE("2", ip, idx, whichfork);
                if (!cur) {
-                        flags |= XFS_ILOG_FEXT(whichfork);
+                        flags |= xfs_ilog_fext(whichfork);
                        break;
                }
                if ((error = xfs_bmbt_update(cur, del_endoff, del_endblock,
@@ -3262,7 +3262,7 @@ xfs_bmap_del_extent(
                if (delay) {
                        temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
                                da_old);
-                        xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
+                        xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
                        XFS_BMAP_TRACE_POST_UPDATE("1", ip, idx,
                                whichfork);
                        da_new = temp;
@@ -3270,7 +3270,7 @@ xfs_bmap_del_extent(
                }
                XFS_BMAP_TRACE_POST_UPDATE("1", ip, idx, whichfork);
                if (!cur) {
-                        flags |= XFS_ILOG_FEXT(whichfork);
+                        flags |= xfs_ilog_fext(whichfork);
                        break;
                }
                if ((error = xfs_bmbt_update(cur, got.br_startoff,
@@ -3345,22 +3345,22 @@ xfs_bmap_del_extent(
                                }
                                XFS_WANT_CORRUPTED_GOTO(i == 1, done);
                        } else
-                                flags |= XFS_ILOG_FEXT(whichfork);
+                                flags |= xfs_ilog_fext(whichfork);
                        XFS_IFORK_NEXT_SET(ip, whichfork,
                                XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
                } else {
                        ASSERT(whichfork == XFS_DATA_FORK);
                        temp = xfs_bmap_worst_indlen(ip, temp);
-                        xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
+                        xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
                        temp2 = xfs_bmap_worst_indlen(ip, temp2);
-                        new.br_startblock = NULLSTARTBLOCK((int)temp2);
+                        new.br_startblock = nullstartblock((int)temp2);
                        da_new = temp + temp2;
                        while (da_new > da_old) {
                                if (temp) {
                                        temp--;
                                        da_new--;
                                        xfs_bmbt_set_startblock(ep,
-                                                NULLSTARTBLOCK((int)temp));
+                                                nullstartblock((int)temp));
                                }
                                if (da_new == da_old)
                                        break;
@@ -3368,7 +3368,7 @@ xfs_bmap_del_extent(
                                        temp2--;
                                        da_new--;
                                        new.br_startblock =
-                                                NULLSTARTBLOCK((int)temp2);
+                                                nullstartblock((int)temp2);
                                }
                        }
                }
@@ -3545,7 +3545,7 @@ xfs_bmap_extents_to_btree(
        nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
        for (cnt = i = 0; i < nextents; i++) {
                ep = xfs_iext_get_ext(ifp, i);
-                if (!ISNULLSTARTBLOCK(xfs_bmbt_get_startblock(ep))) {
+                if (!isnullstartblock(xfs_bmbt_get_startblock(ep))) {
                        arp->l0 = cpu_to_be64(ep->l0);
                        arp->l1 = cpu_to_be64(ep->l1);
                        arp++; cnt++;
@@ -3572,7 +3572,7 @@ xfs_bmap_extents_to_btree(
        xfs_btree_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs));
        ASSERT(*curp == NULL);
        *curp = cur;
-        *logflagsp = XFS_ILOG_CORE | XFS_ILOG_FBROOT(whichfork);
+        *logflagsp = XFS_ILOG_CORE | xfs_ilog_fbroot(whichfork);
        return 0;
 }
@@ -3676,7 +3676,7 @@ xfs_bmap_local_to_extents(
                ip->i_d.di_nblocks = 1;
                XFS_TRANS_MOD_DQUOT_BYINO(args.mp, tp, ip,
                        XFS_TRANS_DQ_BCOUNT, 1L);
-                flags |= XFS_ILOG_FEXT(whichfork);
+                flags |= xfs_ilog_fext(whichfork);
        } else {
                ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0);
                xfs_bmap_forkoff_reset(ip->i_mount, ip, whichfork);
@@ -4082,7 +4082,7 @@ xfs_bmap_add_attrfork(
                XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
        ip->i_afp->if_flags = XFS_IFEXTENTS;
        logflags = 0;
-        XFS_BMAP_INIT(&flist, &firstblock);
+        xfs_bmap_init(&flist, &firstblock);
        switch (ip->i_d.di_format) {
        case XFS_DINODE_FMT_LOCAL:
                error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &flist,
@@ -4162,7 +4162,7 @@ xfs_bmap_add_free(
        ASSERT(bno != NULLFSBLOCK);
        ASSERT(len > 0);
        ASSERT(len <= MAXEXTLEN);
-        ASSERT(!ISNULLSTARTBLOCK(bno));
+        ASSERT(!isnullstartblock(bno));
        agno = XFS_FSB_TO_AGNO(mp, bno);
        agbno = XFS_FSB_TO_AGBNO(mp, bno);
        ASSERT(agno < mp->m_sb.sb_agcount);
@@ -4909,7 +4909,7 @@ xfs_bmapi(
                        got.br_startoff = end;
                inhole = eof || got.br_startoff > bno;
                wasdelay = wr && !inhole && !(flags & XFS_BMAPI_DELAY) &&
-                        ISNULLSTARTBLOCK(got.br_startblock);
+                        isnullstartblock(got.br_startblock);
                /*
                 * First, deal with the hole before the allocated space
                 * that we found, if any.
@@ -5028,7 +5028,7 @@ xfs_bmapi(
                                }
                                ip->i_delayed_blks += alen;
-                                abno = NULLSTARTBLOCK(indlen);
+                                abno = nullstartblock(indlen);
                        } else {
                                /*
                                 * If first time, allocate and fill in
@@ -5144,8 +5144,8 @@ xfs_bmapi(
                                aoff + alen);
 #ifdef DEBUG
                        if (flags & XFS_BMAPI_DELAY) {
-                                ASSERT(ISNULLSTARTBLOCK(got.br_startblock));
+                                ASSERT(isnullstartblock(got.br_startblock));
-                                ASSERT(STARTBLOCKVAL(got.br_startblock) > 0);
+                                ASSERT(startblockval(got.br_startblock) > 0);
                        }
                        ASSERT(got.br_state == XFS_EXT_NORM ||
                               got.br_state == XFS_EXT_UNWRITTEN);
@@ -5179,7 +5179,7 @@ xfs_bmapi(
                        ASSERT((bno >= obno) || (n == 0));
                        ASSERT(bno < end);
                        mval->br_startoff = bno;
-                        if (ISNULLSTARTBLOCK(got.br_startblock)) {
+                        if (isnullstartblock(got.br_startblock)) {
                                ASSERT(!wr || (flags & XFS_BMAPI_DELAY));
                                mval->br_startblock = DELAYSTARTBLOCK;
                        } else
@@ -5201,7 +5201,7 @@ xfs_bmapi(
                        ASSERT(mval->br_blockcount <= len);
                } else {
                        *mval = got;
-                        if (ISNULLSTARTBLOCK(mval->br_startblock)) {
+                        if (isnullstartblock(mval->br_startblock)) {
                                ASSERT(!wr || (flags & XFS_BMAPI_DELAY));
                                mval->br_startblock = DELAYSTARTBLOCK;
                        }
@@ -5329,12 +5329,12 @@ error0:
         * Log everything.  Do this after conversion, there's no point in
         * logging the extent records if we've converted to btree format.
         */
-        if ((logflags & XFS_ILOG_FEXT(whichfork)) &&
+        if ((logflags & xfs_ilog_fext(whichfork)) &&
            XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
-                logflags &= ~XFS_ILOG_FEXT(whichfork);
+                logflags &= ~xfs_ilog_fext(whichfork);
-        else if ((logflags & XFS_ILOG_FBROOT(whichfork)) &&
+        else if ((logflags & xfs_ilog_fbroot(whichfork)) &&
                 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
-                logflags &= ~XFS_ILOG_FBROOT(whichfork);
+                logflags &= ~xfs_ilog_fbroot(whichfork);
        /*
         * Log whatever the flags say, even if error.  Otherwise we might miss
         * detecting a case where the data is changed, there's an error,
@@ -5411,7 +5411,7 @@ xfs_bmapi_single(
                *fsb = NULLFSBLOCK;
                return 0;
        }
-        ASSERT(!ISNULLSTARTBLOCK(got.br_startblock));
+        ASSERT(!isnullstartblock(got.br_startblock));
        ASSERT(bno < got.br_startoff + got.br_blockcount);
        *fsb = got.br_startblock + (bno - got.br_startoff);
        ifp->if_lastex = lastx;
@@ -5543,7 +5543,7 @@ xfs_bunmapi(
                 */
                ASSERT(ep != NULL);
                del = got;
-                wasdel = ISNULLSTARTBLOCK(del.br_startblock);
+                wasdel = isnullstartblock(del.br_startblock);
                if (got.br_startoff < start) {
                        del.br_startoff = start;
                        del.br_blockcount -= start - got.br_startoff;
@@ -5638,7 +5638,7 @@ xfs_bunmapi(
                                xfs_bmbt_get_all(xfs_iext_get_ext(ifp,
                                                lastx - 1), &prev);
                                ASSERT(prev.br_state == XFS_EXT_NORM);
-                                ASSERT(!ISNULLSTARTBLOCK(prev.br_startblock));
+                                ASSERT(!isnullstartblock(prev.br_startblock));
                                ASSERT(del.br_startblock ==
                                       prev.br_startblock + prev.br_blockcount);
                                if (prev.br_startoff < start) {
@@ -5666,7 +5666,7 @@ xfs_bunmapi(
                        }
                }
                if (wasdel) {
-                        ASSERT(STARTBLOCKVAL(del.br_startblock) > 0);
+                        ASSERT(startblockval(del.br_startblock) > 0);
                        /* Update realtime/data freespace, unreserve quota */
                        if (isrt) {
                                xfs_filblks_t rtexts;
@@ -5782,12 +5782,12 @@ error0:
         * Log everything.  Do this after conversion, there's no point in
         * logging the extent records if we've converted to btree format.
         */
-        if ((logflags & XFS_ILOG_FEXT(whichfork)) &&
+        if ((logflags & xfs_ilog_fext(whichfork)) &&
            XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
-                logflags &= ~XFS_ILOG_FEXT(whichfork);
+                logflags &= ~xfs_ilog_fext(whichfork);
-        else if ((logflags & XFS_ILOG_FBROOT(whichfork)) &&
+        else if ((logflags & xfs_ilog_fbroot(whichfork)) &&
                 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
-                logflags &= ~XFS_ILOG_FBROOT(whichfork);
+                logflags &= ~xfs_ilog_fbroot(whichfork);
        /*
         * Log inode even in the error case, if the transaction
         * is dirty we'll need to shut down the filesystem.
@@ -5838,7 +5838,7 @@ xfs_getbmapx_fix_eof_hole(
                if (startblock == DELAYSTARTBLOCK)
                        out->bmv_block = -2;
                else
-                        out->bmv_block = XFS_FSB_TO_DB(ip, startblock);
+                        out->bmv_block = xfs_fsb_to_db(ip, startblock);
                fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset);
                ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
                if (xfs_iext_bno_to_ext(ifp, fileblock, &lastx) &&
@@ -5979,7 +5979,7 @@ xfs_getbmap(
        if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1)
                nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1;
-        bmapi_flags = XFS_BMAPI_AFLAG(whichfork) |
+        bmapi_flags = xfs_bmapi_aflag(whichfork) |
                        ((iflags & BMV_IF_PREALLOC) ? 0 : XFS_BMAPI_IGSTATE);
        /*
@@ -6098,7 +6098,7 @@ xfs_bmap_isaeof(
         */
        *aeof = (off >= s.br_startoff &&
                 off < s.br_startoff + s.br_blockcount &&
-                 ISNULLSTARTBLOCK(s.br_startblock)) ||
+                 isnullstartblock(s.br_startblock)) ||
                off >= s.br_startoff + s.br_blockcount;
        return 0;
 }
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 284571c05ed0..be2979d88d32 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -95,7 +95,6 @@ typedef	struct xfs_bmap_free
                                        /* need write cache flushing and no */
                                        /* additional allocation alignments */
-#define XFS_BMAPI_AFLAG(w)      xfs_bmapi_aflag(w)
 static inline int xfs_bmapi_aflag(int w)
 {
        return (w == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK : 0);
@@ -107,7 +106,6 @@ static inline int xfs_bmapi_aflag(int w)
 #define DELAYSTARTBLOCK         ((xfs_fsblock_t)-1LL)
 #define HOLESTARTBLOCK          ((xfs_fsblock_t)-2LL)
-#define XFS_BMAP_INIT(flp,fbp)  xfs_bmap_init(flp,fbp)
 static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp)
 {
        ((flp)->xbf_first = NULL, (flp)->xbf_count = 0, \
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 8f1ec73725d3..0760d352586f 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -110,25 +110,25 @@ __xfs_bmbt_get_all(
        ext_flag = (int)(l0 >> (64 - BMBT_EXNTFLAG_BITLEN));
        s->br_startoff = ((xfs_fileoff_t)l0 &
-                           XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
+                           xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
 #if XFS_BIG_BLKNOS
-        s->br_startblock = (((xfs_fsblock_t)l0 & XFS_MASK64LO(9)) << 43) |
+        s->br_startblock = (((xfs_fsblock_t)l0 & xfs_mask64lo(9)) << 43) |
                           (((xfs_fsblock_t)l1) >> 21);
 #else
 #ifdef DEBUG
        {
                xfs_dfsbno_t    b;
-                b = (((xfs_dfsbno_t)l0 & XFS_MASK64LO(9)) << 43) |
+                b = (((xfs_dfsbno_t)l0 & xfs_mask64lo(9)) << 43) |
                    (((xfs_dfsbno_t)l1) >> 21);
-                ASSERT((b >> 32) == 0 || ISNULLDSTARTBLOCK(b));
+                ASSERT((b >> 32) == 0 || isnulldstartblock(b));
                s->br_startblock = (xfs_fsblock_t)b;
        }
 #else   /* !DEBUG */
        s->br_startblock = (xfs_fsblock_t)(((xfs_dfsbno_t)l1) >> 21);
 #endif  /* DEBUG */
 #endif  /* XFS_BIG_BLKNOS */
-        s->br_blockcount = (xfs_filblks_t)(l1 & XFS_MASK64LO(21));
+        s->br_blockcount = (xfs_filblks_t)(l1 & xfs_mask64lo(21));
        /* This is xfs_extent_state() in-line */
        if (ext_flag) {
                ASSERT(s->br_blockcount != 0);  /* saved for DMIG */
@@ -153,7 +153,7 @@ xfs_filblks_t
 xfs_bmbt_get_blockcount(
        xfs_bmbt_rec_host_t     *r)
 {
-        return (xfs_filblks_t)(r->l1 & XFS_MASK64LO(21));
+        return (xfs_filblks_t)(r->l1 & xfs_mask64lo(21));
 }
 /*
@@ -164,15 +164,15 @@ xfs_bmbt_get_startblock(
        xfs_bmbt_rec_host_t     *r)
 {
 #if XFS_BIG_BLKNOS
-        return (((xfs_fsblock_t)r->l0 & XFS_MASK64LO(9)) << 43) |
+        return (((xfs_fsblock_t)r->l0 & xfs_mask64lo(9)) << 43) |
               (((xfs_fsblock_t)r->l1) >> 21);
 #else
 #ifdef DEBUG
        xfs_dfsbno_t    b;
-        b = (((xfs_dfsbno_t)r->l0 & XFS_MASK64LO(9)) << 43) |
+        b = (((xfs_dfsbno_t)r->l0 & xfs_mask64lo(9)) << 43) |
            (((xfs_dfsbno_t)r->l1) >> 21);
-        ASSERT((b >> 32) == 0 || ISNULLDSTARTBLOCK(b));
+        ASSERT((b >> 32) == 0 || isnulldstartblock(b));
        return (xfs_fsblock_t)b;
 #else   /* !DEBUG */
        return (xfs_fsblock_t)(((xfs_dfsbno_t)r->l1) >> 21);
@@ -188,7 +188,7 @@ xfs_bmbt_get_startoff(
        xfs_bmbt_rec_host_t     *r)
 {
        return ((xfs_fileoff_t)r->l0 &
-                 XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
+                 xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
 }
 xfs_exntst_t
@@ -219,7 +219,7 @@ xfs_filblks_t
 xfs_bmbt_disk_get_blockcount(
        xfs_bmbt_rec_t  *r)
 {
-        return (xfs_filblks_t)(be64_to_cpu(r->l1) & XFS_MASK64LO(21));
+        return (xfs_filblks_t)(be64_to_cpu(r->l1) & xfs_mask64lo(21));
 }
 /*
@@ -230,7 +230,7 @@ xfs_bmbt_disk_get_startoff(
        xfs_bmbt_rec_t  *r)
 {
        return ((xfs_fileoff_t)be64_to_cpu(r->l0) &
-                 XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
+                 xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
 }
@@ -248,33 +248,33 @@ xfs_bmbt_set_allf(
        int             extent_flag = (state == XFS_EXT_NORM) ? 0 : 1;
        ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN);
-        ASSERT((startoff & XFS_MASK64HI(64-BMBT_STARTOFF_BITLEN)) == 0);
+        ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0);
-        ASSERT((blockcount & XFS_MASK64HI(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
+        ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
 #if XFS_BIG_BLKNOS
-        ASSERT((startblock & XFS_MASK64HI(64-BMBT_STARTBLOCK_BITLEN)) == 0);
+        ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0);
        r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
                ((xfs_bmbt_rec_base_t)startoff << 9) |
                ((xfs_bmbt_rec_base_t)startblock >> 43);
        r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) |
                ((xfs_bmbt_rec_base_t)blockcount &
-                (xfs_bmbt_rec_base_t)XFS_MASK64LO(21));
+                (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
 #else   /* !XFS_BIG_BLKNOS */
-        if (ISNULLSTARTBLOCK(startblock)) {
+        if (isnullstartblock(startblock)) {
                r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
                        ((xfs_bmbt_rec_base_t)startoff << 9) |
-                         (xfs_bmbt_rec_base_t)XFS_MASK64LO(9);
+                         (xfs_bmbt_rec_base_t)xfs_mask64lo(9);
-                r->l1 = XFS_MASK64HI(11) |
+                r->l1 = xfs_mask64hi(11) |
                          ((xfs_bmbt_rec_base_t)startblock << 21) |
                          ((xfs_bmbt_rec_base_t)blockcount &
-                           (xfs_bmbt_rec_base_t)XFS_MASK64LO(21));
+                           (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
        } else {
                r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
                        ((xfs_bmbt_rec_base_t)startoff << 9);
                r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) |
                         ((xfs_bmbt_rec_base_t)blockcount &
-                         (xfs_bmbt_rec_base_t)XFS_MASK64LO(21));
+                         (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
        }
 #endif  /* XFS_BIG_BLKNOS */
 }
@@ -306,11 +306,11 @@ xfs_bmbt_disk_set_allf(
        int                     extent_flag = (state == XFS_EXT_NORM) ? 0 : 1;
        ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN);
-        ASSERT((startoff & XFS_MASK64HI(64-BMBT_STARTOFF_BITLEN)) == 0);
+        ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0);
-        ASSERT((blockcount & XFS_MASK64HI(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
+        ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
 #if XFS_BIG_BLKNOS
-        ASSERT((startblock & XFS_MASK64HI(64-BMBT_STARTBLOCK_BITLEN)) == 0);
+        ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0);
        r->l0 = cpu_to_be64(
                ((xfs_bmbt_rec_base_t)extent_flag << 63) |
@@ -319,17 +319,17 @@ xfs_bmbt_disk_set_allf(
        r->l1 = cpu_to_be64(
                ((xfs_bmbt_rec_base_t)startblock << 21) |
                 ((xfs_bmbt_rec_base_t)blockcount &
-                  (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)));
+                  (xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
 #else   /* !XFS_BIG_BLKNOS */
-        if (ISNULLSTARTBLOCK(startblock)) {
+        if (isnullstartblock(startblock)) {
                r->l0 = cpu_to_be64(
                        ((xfs_bmbt_rec_base_t)extent_flag << 63) |
                         ((xfs_bmbt_rec_base_t)startoff << 9) |
-                          (xfs_bmbt_rec_base_t)XFS_MASK64LO(9));
+                          (xfs_bmbt_rec_base_t)xfs_mask64lo(9));
-                r->l1 = cpu_to_be64(XFS_MASK64HI(11) |
+                r->l1 = cpu_to_be64(xfs_mask64hi(11) |
                          ((xfs_bmbt_rec_base_t)startblock << 21) |
                          ((xfs_bmbt_rec_base_t)blockcount &
-                           (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)));
+                           (xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
        } else {
                r->l0 = cpu_to_be64(
                        ((xfs_bmbt_rec_base_t)extent_flag << 63) |
@@ -337,7 +337,7 @@ xfs_bmbt_disk_set_allf(
                r->l1 = cpu_to_be64(
                        ((xfs_bmbt_rec_base_t)startblock << 21) |
                         ((xfs_bmbt_rec_base_t)blockcount &
-                          (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)));
+                          (xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
        }
 #endif  /* XFS_BIG_BLKNOS */
 }
@@ -362,9 +362,9 @@ xfs_bmbt_set_blockcount(
        xfs_bmbt_rec_host_t *r,
        xfs_filblks_t   v)
 {
-        ASSERT((v & XFS_MASK64HI(43)) == 0);
+        ASSERT((v & xfs_mask64hi(43)) == 0);
-        r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)XFS_MASK64HI(43)) |
+        r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64hi(43)) |
-                  (xfs_bmbt_rec_base_t)(v & XFS_MASK64LO(21));
+                  (xfs_bmbt_rec_base_t)(v & xfs_mask64lo(21));
 }
 /*
@@ -376,21 +376,21 @@ xfs_bmbt_set_startblock(
        xfs_fsblock_t   v)
 {
 #if XFS_BIG_BLKNOS
-        ASSERT((v & XFS_MASK64HI(12)) == 0);
+        ASSERT((v & xfs_mask64hi(12)) == 0);
-        r->l0 = (r->l0 & (xfs_bmbt_rec_base_t)XFS_MASK64HI(55)) |
+        r->l0 = (r->l0 & (xfs_bmbt_rec_base_t)xfs_mask64hi(55)) |
                  (xfs_bmbt_rec_base_t)(v >> 43);
-        r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)) |
+        r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21)) |
                  (xfs_bmbt_rec_base_t)(v << 21);
 #else   /* !XFS_BIG_BLKNOS */
-        if (ISNULLSTARTBLOCK(v)) {
+        if (isnullstartblock(v)) {
-                r->l0 |= (xfs_bmbt_rec_base_t)XFS_MASK64LO(9);
+                r->l0 |= (xfs_bmbt_rec_base_t)xfs_mask64lo(9);
-                r->l1 = (xfs_bmbt_rec_base_t)XFS_MASK64HI(11) |
+                r->l1 = (xfs_bmbt_rec_base_t)xfs_mask64hi(11) |
                          ((xfs_bmbt_rec_base_t)v << 21) |
-                          (r->l1 & (xfs_bmbt_rec_base_t)XFS_MASK64LO(21));
+                          (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
        } else {
-                r->l0 &= ~(xfs_bmbt_rec_base_t)XFS_MASK64LO(9);
+                r->l0 &= ~(xfs_bmbt_rec_base_t)xfs_mask64lo(9);
                r->l1 = ((xfs_bmbt_rec_base_t)v << 21) |
-                          (r->l1 & (xfs_bmbt_rec_base_t)XFS_MASK64LO(21));
+                          (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
        }
 #endif  /* XFS_BIG_BLKNOS */
 }
@@ -403,10 +403,10 @@ xfs_bmbt_set_startoff(
        xfs_bmbt_rec_host_t *r,
        xfs_fileoff_t   v)
 {
-        ASSERT((v & XFS_MASK64HI(9)) == 0);
+        ASSERT((v & xfs_mask64hi(9)) == 0);
-        r->l0 = (r->l0 & (xfs_bmbt_rec_base_t) XFS_MASK64HI(1)) |
+        r->l0 = (r->l0 & (xfs_bmbt_rec_base_t) xfs_mask64hi(1)) |
                ((xfs_bmbt_rec_base_t)v << 9) |
-                  (r->l0 & (xfs_bmbt_rec_base_t)XFS_MASK64LO(9));
+                  (r->l0 & (xfs_bmbt_rec_base_t)xfs_mask64lo(9));
 }
 /*
@@ -419,9 +419,9 @@ xfs_bmbt_set_state(
 {
        ASSERT(v == XFS_EXT_NORM || v == XFS_EXT_UNWRITTEN);
        if (v == XFS_EXT_NORM)
-                r->l0 &= XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN);
+                r->l0 &= xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN);
        else
-                r->l0 |= XFS_MASK64HI(BMBT_EXNTFLAG_BITLEN);
+                r->l0 |= xfs_mask64hi(BMBT_EXNTFLAG_BITLEN);
 }
 /*
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index a4555abb6622..0e8df007615e 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -76,26 +76,22 @@ typedef struct xfs_bmbt_rec_host {
 #define DSTARTBLOCKMASK         \
        (((((xfs_dfsbno_t)1) << DSTARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)
-#define ISNULLSTARTBLOCK(x)     isnullstartblock(x)
 static inline int isnullstartblock(xfs_fsblock_t x)
 {
        return ((x) & STARTBLOCKMASK) == STARTBLOCKMASK;
 }
-#define ISNULLDSTARTBLOCK(x)    isnulldstartblock(x)
 static inline int isnulldstartblock(xfs_dfsbno_t x)
 {
        return ((x) & DSTARTBLOCKMASK) == DSTARTBLOCKMASK;
 }
-#define NULLSTARTBLOCK(k)       nullstartblock(k)
 static inline xfs_fsblock_t nullstartblock(int k)
 {
        ASSERT(k < (1 << STARTBLOCKVALBITS));
        return STARTBLOCKMASK | (k);
 }
-#define STARTBLOCKVAL(x)        startblockval(x)
 static inline xfs_filblks_t startblockval(xfs_fsblock_t x)
 {
        return (xfs_filblks_t)((x) & ~STARTBLOCKMASK);
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 7ed59267420d..e73c332eb23f 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -730,8 +730,8 @@ xfs_btree_readahead_lblock(
        struct xfs_btree_block  *block)
 {
        int                     rval = 0;
-        xfs_fsblock_t           left = be64_to_cpu(block->bb_u.l.bb_leftsib);
+        xfs_dfsbno_t            left = be64_to_cpu(block->bb_u.l.bb_leftsib);
-        xfs_fsblock_t           right = be64_to_cpu(block->bb_u.l.bb_rightsib);
+        xfs_dfsbno_t            right = be64_to_cpu(block->bb_u.l.bb_rightsib);
        if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) {
                xfs_btree_reada_bufl(cur->bc_mp, left, 1);
@@ -843,7 +843,7 @@ xfs_btree_ptr_is_null(
        union xfs_btree_ptr     *ptr)
 {
        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
-                return be64_to_cpu(ptr->l) == NULLFSBLOCK;
+                return be64_to_cpu(ptr->l) == NULLDFSBNO;
        else
                return be32_to_cpu(ptr->s) == NULLAGBLOCK;
 }
@@ -854,7 +854,7 @@ xfs_btree_set_ptr_null(
        union xfs_btree_ptr     *ptr)
 {
        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
-                ptr->l = cpu_to_be64(NULLFSBLOCK);
+                ptr->l = cpu_to_be64(NULLDFSBNO);
        else
                ptr->s = cpu_to_be32(NULLAGBLOCK);
 }
@@ -918,8 +918,8 @@ xfs_btree_init_block(
        new->bb_numrecs = cpu_to_be16(numrecs);
        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
-                new->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
+                new->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
-                new->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
+                new->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
        } else {
                new->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
                new->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
@@ -960,7 +960,7 @@ xfs_btree_buf_to_ptr(
                ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
                                        XFS_BUF_ADDR(bp)));
        else {
-                ptr->s = cpu_to_be32(XFS_DADDR_TO_AGBNO(cur->bc_mp,
+                ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
                                        XFS_BUF_ADDR(bp)));
        }
 }
@@ -971,7 +971,7 @@ xfs_btree_ptr_to_daddr(
        union xfs_btree_ptr     *ptr)
 {
        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
-                ASSERT(be64_to_cpu(ptr->l) != NULLFSBLOCK);
+                ASSERT(be64_to_cpu(ptr->l) != NULLDFSBNO);
                return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
        } else {
@@ -2454,7 +2454,7 @@ xfs_btree_new_iroot(
        xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
        *logflags |=
-                XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork);
+                XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork);
        *stat = 1;
        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
        return 0;
@@ -3048,7 +3048,7 @@ xfs_btree_kill_iroot(
        cur->bc_bufs[level - 1] = NULL;
        be16_add_cpu(&block->bb_level, -1);
        xfs_trans_log_inode(cur->bc_tp, ip,
-                XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
+                XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork));
        cur->bc_nlevels--;
 out0:
        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index a11a8390bf6c..c45f74ff1a5b 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -1597,7 +1597,7 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
        nmap = 1;
        ASSERT(args->firstblock != NULL);
        if ((error = xfs_bmapi(tp, dp, bno, count,
-                        XFS_BMAPI_AFLAG(w)|XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|
+                        xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|
                        XFS_BMAPI_CONTIG,
                        args->firstblock, args->total, &map, &nmap,
                        args->flist, NULL))) {
@@ -1618,7 +1618,7 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
                        nmap = MIN(XFS_BMAP_MAX_NMAP, count);
                        c = (int)(bno + count - b);
                        if ((error = xfs_bmapi(tp, dp, b, c,
-                                        XFS_BMAPI_AFLAG(w)|XFS_BMAPI_WRITE|
+                                        xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE|
                                        XFS_BMAPI_METADATA,
                                        args->firstblock, args->total,
                                        &mapp[mapi], &nmap, args->flist,
@@ -1882,7 +1882,7 @@ xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
                 * the last block to the place we want to kill.
                 */
                if ((error = xfs_bunmapi(tp, dp, dead_blkno, count,
-                                XFS_BMAPI_AFLAG(w)|XFS_BMAPI_METADATA,
+                                xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
                                0, args->firstblock, args->flist, NULL,
                                &done)) == ENOSPC) {
                        if (w != XFS_DATA_FORK)
@@ -1987,7 +1987,7 @@ xfs_da_do_buf(
                        if ((error = xfs_bmapi(trans, dp, (xfs_fileoff_t)bno,
                                        nfsb,
                                        XFS_BMAPI_METADATA |
-                                                XFS_BMAPI_AFLAG(whichfork),
+                                                xfs_bmapi_aflag(whichfork),
                                        NULL, 0, mapp, &nmap, NULL, NULL)))
                                goto exit0;
                }
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index b4c1ee713492..f8278cfcc1d3 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -55,17 +55,11 @@ xfs_swapext(
        struct file     *file, *target_file;
        int             error = 0;
-        sxp = kmem_alloc(sizeof(xfs_swapext_t), KM_MAYFAIL);
-        if (!sxp) {
-                error = XFS_ERROR(ENOMEM);
-                goto out;
-        }
        /* Pull information for the target fd */
        file = fget((int)sxp->sx_fdtarget);
        if (!file) {
                error = XFS_ERROR(EINVAL);
-                goto out_free_sxp;
+                goto out;
        }
        if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) {
@@ -109,8 +103,6 @@ xfs_swapext(
        fput(target_file);
 out_put_file:
        fput(file);
- out_free_sxp:
-        kmem_free(sxp);
 out:
        return error;
 }
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index e2fa0a1d8e96..e1f0a06aaf04 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -517,9 +517,9 @@ xfs_dir2_block_getdents(
                /*
                 * If it didn't fit, set the final offset to here & return.
                 */
-                if (filldir(dirent, dep->name, dep->namelen, cook,
+                if (filldir(dirent, dep->name, dep->namelen, cook & 0x7fffffff,
                            ino, DT_UNKNOWN)) {
-                        *offset = cook;
+                        *offset = cook & 0x7fffffff;
                        xfs_da_brelse(NULL, bp);
                        return 0;
                }
@@ -529,7 +529,8 @@ xfs_dir2_block_getdents(
         * Reached the end of the block.
         * Set the offset to a non-existent block 1 and return.
         */
-        *offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0);
+        *offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
+                        0x7fffffff;
        xfs_da_brelse(NULL, bp);
        return 0;
 }
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 93535992cb60..ef805a374eec 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -1092,7 +1092,7 @@ xfs_dir2_leaf_getdents(
                 * Won't fit.  Return to caller.
                 */
                if (filldir(dirent, dep->name, dep->namelen,
-                            xfs_dir2_byte_to_dataptr(mp, curoff),
+                            xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff,
                            ino, DT_UNKNOWN))
                        break;
@@ -1108,9 +1108,9 @@ xfs_dir2_leaf_getdents(
         * All done.  Set output offset value to current offset.
         */
        if (curoff > xfs_dir2_dataptr_to_byte(mp, XFS_DIR2_MAX_DATAPTR))
-                *offset = XFS_DIR2_MAX_DATAPTR;
+                *offset = XFS_DIR2_MAX_DATAPTR & 0x7fffffff;
        else
-                *offset = xfs_dir2_byte_to_dataptr(mp, curoff);
+                *offset = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff;
        kmem_free(map);
        if (bp)
                xfs_da_brelse(NULL, bp);
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index b46af0013ec9..a8a8a6efad5b 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -752,8 +752,8 @@ xfs_dir2_sf_getdents(
 #if XFS_BIG_INUMS
                ino += mp->m_inoadd;
 #endif
-                if (filldir(dirent, ".", 1, dot_offset, ino, DT_DIR)) {
+                if (filldir(dirent, ".", 1, dot_offset & 0x7fffffff, ino, DT_DIR)) {
-                        *offset = dot_offset;
+                        *offset = dot_offset & 0x7fffffff;
                        return 0;
                }
        }
@@ -766,8 +766,8 @@ xfs_dir2_sf_getdents(
 #if XFS_BIG_INUMS
                ino += mp->m_inoadd;
 #endif
-                if (filldir(dirent, "..", 2, dotdot_offset, ino, DT_DIR)) {
+                if (filldir(dirent, "..", 2, dotdot_offset & 0x7fffffff, ino, DT_DIR)) {
-                        *offset = dotdot_offset;
+                        *offset = dotdot_offset & 0x7fffffff;
                        return 0;
                }
        }
@@ -791,14 +791,15 @@ xfs_dir2_sf_getdents(
 #endif
                if (filldir(dirent, sfep->name, sfep->namelen,
-                                            off, ino, DT_UNKNOWN)) {
+                            off & 0x7fffffff, ino, DT_UNKNOWN)) {
-                        *offset = off;
+                        *offset = off & 0x7fffffff;
                        return 0;
                }
                sfep = xfs_dir2_sf_nextentry(sfp, sfep);
        }
-        *offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0);
+        *offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
+                        0x7fffffff;
        return 0;
 }
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 589c41c38446..f7c06fac8229 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -465,8 +465,8 @@ typedef struct xfs_handle {
 #define XFS_IOC_ERROR_INJECTION      _IOW ('X', 116, struct xfs_error_injection)
 #define XFS_IOC_ERROR_CLEARALL       _IOW ('X', 117, struct xfs_error_injection)
 /*      XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118      */
-#define XFS_IOC_FREEZE               _IOWR('X', 119, int)
+/*      XFS_IOC_FREEZE            -- FIFREEZE   119      */
-#define XFS_IOC_THAW                 _IOWR('X', 120, int)
+/*      XFS_IOC_THAW              -- FITHAW     120      */
 #define XFS_IOC_FSSETDM_BY_HANDLE    _IOW ('X', 121, struct xfs_fsop_setdm_handlereq)
 #define XFS_IOC_ATTRLIST_BY_HANDLE   _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq)
 #define XFS_IOC_ATTRMULTI_BY_HANDLE  _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq)
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 852b6d32e8d0..680d0e0ec932 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -595,17 +595,19 @@ out:
        return 0;
 }
-void
+int
 xfs_fs_log_dummy(
        xfs_mount_t     *mp)
 {
        xfs_trans_t     *tp;
        xfs_inode_t     *ip;
+        int             error;
        tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
-        if (xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0)) {
+        error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
+        if (error) {
                xfs_trans_cancel(tp, 0);
-                return;
+                return error;
        }
        ip = mp->m_rootip;
@@ -615,9 +617,10 @@ xfs_fs_log_dummy(
        xfs_trans_ihold(tp, ip);
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
        xfs_trans_set_sync(tp);
-        xfs_trans_commit(tp, 0);
+        error = xfs_trans_commit(tp, 0);
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
+        return error;
 }
 int
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index 300d0c9d61ad..88435e0a77c9 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -25,6 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
 extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval,
                                xfs_fsop_resblks_t *outval);
 extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags);
-extern void xfs_fs_log_dummy(xfs_mount_t *mp);
+extern int xfs_fs_log_dummy(xfs_mount_t *mp);
 #endif  /* __XFS_FSOPS_H__ */
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index e6ebbaeb4dc6..ab016e5ae7be 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -357,7 +357,7 @@ xfs_ialloc_ag_alloc(
                        int     ioffset = i << args.mp->m_sb.sb_inodelog;
                        uint    isize = sizeof(struct xfs_dinode);
-                        free = XFS_MAKE_IPTR(args.mp, fbuf, i);
+                        free = xfs_make_iptr(args.mp, fbuf, i);
                        free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
                        free->di_version = version;
                        free->di_gen = cpu_to_be32(gen);
@@ -937,7 +937,7 @@ nextag:
                        }
                }
        }
-        offset = XFS_IALLOC_FIND_FREE(&rec.ir_free);
+        offset = xfs_ialloc_find_free(&rec.ir_free);
        ASSERT(offset >= 0);
        ASSERT(offset < XFS_INODES_PER_CHUNK);
        ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
@@ -1279,7 +1279,7 @@ xfs_imap(
                offset = XFS_INO_TO_OFFSET(mp, ino);
                ASSERT(offset < mp->m_sb.sb_inopblock);
-                cluster_agbno = XFS_DADDR_TO_AGBNO(mp, imap->im_blkno);
+                cluster_agbno = xfs_daddr_to_agbno(mp, imap->im_blkno);
                offset += (agbno - cluster_agbno) * mp->m_sb.sb_inopblock;
                imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index 50f558a4e0a8..aeee8278f92c 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -39,7 +39,6 @@ struct xfs_trans;
 /*
 * Make an inode pointer out of the buffer/offset.
 */
-#define XFS_MAKE_IPTR(mp,b,o)           xfs_make_iptr(mp,b,o)
 static inline struct xfs_dinode *
 xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o)
 {
@@ -50,7 +49,6 @@ xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o)
 /*
 * Find a free (set) bit in the inode bitmask.
 */
-#define XFS_IALLOC_FIND_FREE(fp)        xfs_ialloc_find_free(fp)
 static inline int xfs_ialloc_find_free(xfs_inofree_t *fp)
 {
        return xfs_lowbit64(*fp);
diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h
index 37e5dd01a577..5580e255ff06 100644
--- a/fs/xfs/xfs_ialloc_btree.h
+++ b/fs/xfs/xfs_ialloc_btree.h
@@ -36,7 +36,6 @@ typedef	__uint64_t	xfs_inofree_t;
 #define XFS_INODES_PER_CHUNK_LOG        (XFS_NBBYLOG + 3)
 #define XFS_INOBT_ALL_FREE      ((xfs_inofree_t)-1)
-#define XFS_INOBT_MASKN(i,n)            xfs_inobt_maskn(i,n)
 static inline xfs_inofree_t xfs_inobt_maskn(int i, int n)
 {
        return (((n) >= XFS_INODES_PER_CHUNK ? \
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 5a5e035e5d38..e7ae08d1df48 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -424,6 +424,19 @@ xfs_iformat(
        case XFS_DINODE_FMT_LOCAL:
                atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
                size = be16_to_cpu(atp->hdr.totsize);
+                if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) {
+                        xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
+                                "corrupt inode %Lu "
+                                "(bad attr fork size %Ld).",
+                                (unsigned long long) ip->i_ino,
+                                (long long) size);
+                        XFS_CORRUPTION_ERROR("xfs_iformat(8)",
+                                             XFS_ERRLEVEL_LOW,
+                                             ip->i_mount, dip);
+                        return XFS_ERROR(EFSCORRUPTED);
+                }
                error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
                break;
        case XFS_DINODE_FMT_EXTENTS:
@@ -1601,10 +1614,10 @@ xfs_itruncate_finish(
                 * in this file with garbage in them once recovery
                 * runs.
                 */
-                XFS_BMAP_INIT(&free_list, &first_block);
+                xfs_bmap_init(&free_list, &first_block);
                error = xfs_bunmapi(ntp, ip,
                                    first_unmap_block, unmap_len,
-                                    XFS_BMAPI_AFLAG(fork) |
+                                    xfs_bmapi_aflag(fork) |
                                      (sync ? 0 : XFS_BMAPI_ASYNC),
                                    XFS_ITRUNC_MAX_EXTENTS,
                                    &first_block, &free_list,
@@ -2557,7 +2570,7 @@ xfs_iextents_copy(
        for (i = 0; i < nrecs; i++) {
                xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
                start_block = xfs_bmbt_get_startblock(ep);
-                if (ISNULLSTARTBLOCK(start_block)) {
+                if (isnullstartblock(start_block)) {
                        /*
                         * It's a delayed allocation extent, so skip it.
                         */
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 1ff04cc323ad..9957d0602d54 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -111,20 +111,16 @@ typedef struct xfs_inode_log_format_64 {
 #define XFS_ILI_IOLOCKED_ANY   (XFS_ILI_IOLOCKED_EXCL | XFS_ILI_IOLOCKED_SHARED)
-#define XFS_ILOG_FBROOT(w)      xfs_ilog_fbroot(w)
 static inline int xfs_ilog_fbroot(int w)
 {
        return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT);
 }
-#define XFS_ILOG_FEXT(w)        xfs_ilog_fext(w)
 static inline int xfs_ilog_fext(int w)
 {
        return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT);
 }
-#define XFS_ILOG_FDATA(w)       xfs_ilog_fdata(w)
 static inline int xfs_ilog_fdata(int w)
 {
        return (w == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 911062cf73a6..08ce72316bfe 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -155,7 +155,7 @@ xfs_imap_to_bmap(
                        iomapp->iomap_bn = IOMAP_DADDR_NULL;
                        iomapp->iomap_flags |= IOMAP_DELAY;
                } else {
-                        iomapp->iomap_bn = XFS_FSB_TO_DB(ip, start_block);
+                        iomapp->iomap_bn = xfs_fsb_to_db(ip, start_block);
                        if (ISUNWRITTEN(imap))
                                iomapp->iomap_flags |= IOMAP_UNWRITTEN;
                }
@@ -261,7 +261,7 @@ xfs_iomap(
                xfs_iunlock(ip, lockmode);
                lockmode = 0;
-                if (nimaps && !ISNULLSTARTBLOCK(imap.br_startblock)) {
+                if (nimaps && !isnullstartblock(imap.br_startblock)) {
                        xfs_iomap_map_trace(XFS_IOMAP_WRITE_MAP, ip,
                                        offset, count, iomapp, &imap, flags);
                        break;
@@ -491,7 +491,7 @@ xfs_iomap_write_direct(
        /*
         * Issue the xfs_bmapi() call to allocate the blocks
         */
-        XFS_BMAP_INIT(&free_list, &firstfsb);
+        xfs_bmap_init(&free_list, &firstfsb);
        nimaps = 1;
        error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, bmapi_flag,
                &firstfsb, 0, &imap, &nimaps, &free_list, NULL);
@@ -751,7 +751,7 @@ xfs_iomap_write_allocate(
                        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
                        xfs_trans_ihold(tp, ip);
-                        XFS_BMAP_INIT(&free_list, &first_block);
+                        xfs_bmap_init(&free_list, &first_block);
                        /*
                         * it is possible that the extents have changed since
@@ -911,7 +911,7 @@ xfs_iomap_write_unwritten(
                /*
                 * Modify the unwritten extent state of the buffer.
                 */
-                XFS_BMAP_INIT(&free_list, &firstfsb);
+                xfs_bmap_init(&free_list, &firstfsb);
                nimaps = 1;
                error = xfs_bmapi(tp, ip, offset_fsb, count_fsb,
                                  XFS_BMAPI_WRITE|XFS_BMAPI_CONVERT, &firstfsb,
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index e19d0a8d5618..cf98a805ec90 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -453,7 +453,7 @@ xfs_bulkstat(
                            (chunkidx = agino - gino + 1) <
                                    XFS_INODES_PER_CHUNK &&
                                        /* there are some left allocated */
-                            XFS_INOBT_MASKN(chunkidx,
+                            xfs_inobt_maskn(chunkidx,
                                    XFS_INODES_PER_CHUNK - chunkidx) & ~gfree) {
                                /*
                                 * Grab the chunk record.  Mark all the
@@ -464,7 +464,7 @@ xfs_bulkstat(
                                        if (XFS_INOBT_MASK(i) & ~gfree)
                                                gcnt++;
                                }
-                                gfree |= XFS_INOBT_MASKN(0, chunkidx);
+                                gfree |= xfs_inobt_maskn(0, chunkidx);
                                irbp->ir_startino = gino;
                                irbp->ir_freecount = gcnt;
                                irbp->ir_free = gfree;
@@ -535,7 +535,7 @@ xfs_bulkstat(
                                     chunkidx < XFS_INODES_PER_CHUNK;
                                     chunkidx += nicluster,
                                     agbno += nbcluster) {
-                                        if (XFS_INOBT_MASKN(chunkidx,
+                                        if (xfs_inobt_maskn(chunkidx,
                                                            nicluster) & ~gfree)
                                                xfs_btree_reada_bufs(mp, agno,
                                                        agbno, nbcluster);
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 35cca98bd94c..b1047de2fffd 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -70,16 +70,21 @@ STATIC void	xlog_recover_check_summary(xlog_t *);
 xfs_buf_t *
 xlog_get_bp(
        xlog_t          *log,
-        int             num_bblks)
+        int             nbblks)
 {
-        ASSERT(num_bblks > 0);
+        if (nbblks <= 0 || nbblks > log->l_logBBsize) {
+                xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks);
+                XFS_ERROR_REPORT("xlog_get_bp(1)",
+                                 XFS_ERRLEVEL_HIGH, log->l_mp);
+                return NULL;
+        }
        if (log->l_sectbb_log) {
-                if (num_bblks > 1)
+                if (nbblks > 1)
-                        num_bblks += XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1);
+                        nbblks += XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1);
-                num_bblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, num_bblks);
+                nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
        }
-        return xfs_buf_get_noaddr(BBTOB(num_bblks), log->l_mp->m_logdev_targp);
+        return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp);
 }
 void
@@ -102,6 +107,13 @@ xlog_bread(
 {
        int             error;
+        if (nbblks <= 0 || nbblks > log->l_logBBsize) {
+                xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks);
+                XFS_ERROR_REPORT("xlog_bread(1)",
+                                 XFS_ERRLEVEL_HIGH, log->l_mp);
+                return EFSCORRUPTED;
+        }
        if (log->l_sectbb_log) {
                blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no);
                nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
@@ -139,6 +151,13 @@ xlog_bwrite(
 {
        int             error;
+        if (nbblks <= 0 || nbblks > log->l_logBBsize) {
+                xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks);
+                XFS_ERROR_REPORT("xlog_bwrite(1)",
+                                 XFS_ERRLEVEL_HIGH, log->l_mp);
+                return EFSCORRUPTED;
+        }
        if (log->l_sectbb_log) {
                blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no);
                nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 3c97c6463a4e..35300250e86d 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -45,7 +45,6 @@
 #include "xfs_fsops.h"
 #include "xfs_utils.h"
-STATIC int      xfs_mount_log_sb(xfs_mount_t *, __int64_t);
 STATIC int      xfs_uuid_mount(xfs_mount_t *);
 STATIC void     xfs_unmountfs_wait(xfs_mount_t *);
@@ -682,7 +681,7 @@ xfs_initialize_perag_data(xfs_mount_t *mp, xfs_agnumber_t agcount)
 * Update alignment values based on mount options and sb values
 */
 STATIC int
-xfs_update_alignment(xfs_mount_t *mp, __uint64_t *update_flags)
+xfs_update_alignment(xfs_mount_t *mp)
 {
        xfs_sb_t        *sbp = &(mp->m_sb);
@@ -736,11 +735,11 @@ xfs_update_alignment(xfs_mount_t *mp, __uint64_t *update_flags)
                if (xfs_sb_version_hasdalign(sbp)) {
                        if (sbp->sb_unit != mp->m_dalign) {
                                sbp->sb_unit = mp->m_dalign;
-                                *update_flags |= XFS_SB_UNIT;
+                                mp->m_update_flags |= XFS_SB_UNIT;
                        }
                        if (sbp->sb_width != mp->m_swidth) {
                                sbp->sb_width = mp->m_swidth;
-                                *update_flags |= XFS_SB_WIDTH;
+                                mp->m_update_flags |= XFS_SB_WIDTH;
                        }
                }
        } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN &&
@@ -905,7 +904,6 @@ xfs_mountfs(
        xfs_sb_t        *sbp = &(mp->m_sb);
        xfs_inode_t     *rip;
        __uint64_t      resblks;
-        __int64_t       update_flags = 0LL;
        uint            quotamount, quotaflags;
        int             uuid_mounted = 0;
        int             error = 0;
@@ -933,7 +931,7 @@ xfs_mountfs(
                        "XFS: correcting sb_features alignment problem");
                sbp->sb_features2 |= sbp->sb_bad_features2;
                sbp->sb_bad_features2 = sbp->sb_features2;
-                update_flags |= XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2;
+                mp->m_update_flags |= XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2;
                /*
                 * Re-check for ATTR2 in case it was found in bad_features2
@@ -947,11 +945,11 @@ xfs_mountfs(
        if (xfs_sb_version_hasattr2(&mp->m_sb) &&
           (mp->m_flags & XFS_MOUNT_NOATTR2)) {
                xfs_sb_version_removeattr2(&mp->m_sb);
-                update_flags |= XFS_SB_FEATURES2;
+                mp->m_update_flags |= XFS_SB_FEATURES2;
                /* update sb_versionnum for the clearing of the morebits */
                if (!sbp->sb_features2)
-                        update_flags |= XFS_SB_VERSIONNUM;
+                        mp->m_update_flags |= XFS_SB_VERSIONNUM;
        }
        /*
@@ -960,7 +958,7 @@ xfs_mountfs(
         * allocator alignment is within an ag, therefore ag has
         * to be aligned at stripe boundary.
         */
-        error = xfs_update_alignment(mp, &update_flags);
+        error = xfs_update_alignment(mp);
        if (error)
                goto error1;
@@ -1137,10 +1135,12 @@ xfs_mountfs(
        }
        /*
-         * If fs is not mounted readonly, then update the superblock changes.
+         * If this is a read-only mount defer the superblock updates until
+         * the next remount into writeable mode.  Otherwise we would never
+         * perform the update e.g. for the root filesystem.
         */
-        if (update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) {
+        if (mp->m_update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) {
-                error = xfs_mount_log_sb(mp, update_flags);
+                error = xfs_mount_log_sb(mp, mp->m_update_flags);
                if (error) {
                        cmn_err(CE_WARN, "XFS: failed to write sb changes");
                        goto error4;
@@ -1820,7 +1820,7 @@ xfs_uuid_mount(
 * be altered by the mount options, as well as any potential sb_features2
 * fixup. Only the first superblock is updated.
 */
-STATIC int
+int
 xfs_mount_log_sb(
        xfs_mount_t     *mp,
        __int64_t       fields)
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index c1e028467327..f5e9937f9bdb 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -44,9 +44,9 @@ typedef struct xfs_trans_reservations {
 #ifndef __KERNEL__
-#define XFS_DADDR_TO_AGNO(mp,d) \
+#define xfs_daddr_to_agno(mp,d) \
        ((xfs_agnumber_t)(XFS_BB_TO_FSBT(mp, d) / (mp)->m_sb.sb_agblocks))
-#define XFS_DADDR_TO_AGBNO(mp,d) \
+#define xfs_daddr_to_agbno(mp,d) \
        ((xfs_agblock_t)(XFS_BB_TO_FSBT(mp, d) % (mp)->m_sb.sb_agblocks))
 #else /* __KERNEL__ */
@@ -327,6 +327,8 @@ typedef struct xfs_mount {
        spinlock_t              m_sync_lock;    /* work item list lock */
        int                     m_sync_seq;     /* sync thread generation no. */
        wait_queue_head_t       m_wait_single_sync_task;
+        __int64_t               m_update_flags; /* sb flags we need to update
+                                                   on the next remount,rw */
 } xfs_mount_t;
 /*
@@ -439,7 +441,6 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
 */
 #define XFS_MFSI_QUIET          0x40    /* Be silent if mount errors found */
-#define XFS_DADDR_TO_AGNO(mp,d)         xfs_daddr_to_agno(mp,d)
 static inline xfs_agnumber_t
 xfs_daddr_to_agno(struct xfs_mount *mp, xfs_daddr_t d)
 {
@@ -448,7 +449,6 @@ xfs_daddr_to_agno(struct xfs_mount *mp, xfs_daddr_t d)
        return (xfs_agnumber_t) ld;
 }
-#define XFS_DADDR_TO_AGBNO(mp,d)        xfs_daddr_to_agbno(mp,d)
 static inline xfs_agblock_t
 xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
 {
@@ -514,6 +514,7 @@ extern int	xfs_mod_incore_sb_unlocked(xfs_mount_t *, xfs_sb_field_t,
                        int64_t, int);
 extern int      xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *,
                        uint, int);
+extern int      xfs_mount_log_sb(xfs_mount_t *, __int64_t);
 extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int);
 extern int      xfs_readsb(xfs_mount_t *, int);
 extern void     xfs_freesb(xfs_mount_t *);
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index 86471bb40fd4..58f85e9cd11d 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -147,7 +147,7 @@ xfs_rename(
        xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip,
                                inodes, &num_inodes);
-        XFS_BMAP_INIT(&free_list, &first_block);
+        xfs_bmap_init(&free_list, &first_block);
        tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME);
        cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
        spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index edf12c7b834c..c5bb86f3ec05 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -120,7 +120,7 @@ xfs_growfs_rt_alloc(
                if ((error = xfs_trans_iget(mp, tp, ino, 0,
                                                XFS_ILOCK_EXCL, &ip)))
                        goto error_cancel;
-                XFS_BMAP_INIT(&flist, &firstblock);
+                xfs_bmap_init(&flist, &firstblock);
                /*
                 * Allocate blocks to the bitmap file.
                 */
diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h
index f87db5344ce6..f76c003ec55d 100644
--- a/fs/xfs/xfs_rw.h
+++ b/fs/xfs/xfs_rw.h
@@ -28,7 +28,6 @@ struct xfs_mount;
 * file is a real time file or not, because the bmap code
 * does.
 */
-#define XFS_FSB_TO_DB(ip,fsb)   xfs_fsb_to_db(ip,fsb)
 static inline xfs_daddr_t
 xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
 {
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index 1ed71916e4c9..1b017c657494 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -505,7 +505,7 @@ static inline void xfs_sb_version_removeattr2(xfs_sb_t *sbp)
 #define XFS_HDR_BLOCK(mp,d)     ((xfs_agblock_t)XFS_BB_TO_FSBT(mp,d))
 #define XFS_DADDR_TO_FSB(mp,d)  XFS_AGB_TO_FSB(mp, \
-                        XFS_DADDR_TO_AGNO(mp,d), XFS_DADDR_TO_AGBNO(mp,d))
+                        xfs_daddr_to_agno(mp,d), xfs_daddr_to_agbno(mp,d))
 #define XFS_FSB_TO_DADDR(mp,fsbno)      XFS_AGB_TO_DADDR(mp, \
                        XFS_FSB_TO_AGNO(mp,fsbno), XFS_FSB_TO_AGBNO(mp,fsbno))
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index 0f5191644ab2..b2f724502f1b 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -45,7 +45,7 @@ typedef __uint32_t		prid_t;		/* project ID */
 typedef __uint32_t              inst_t;         /* an instruction */
 typedef __s64                   xfs_off_t;      /* <file offset> type */
-typedef __u64                   xfs_ino_t;      /* <inode> type */
+typedef unsigned long long      xfs_ino_t;      /* <inode> type */
 typedef __s64                   xfs_daddr_t;    /* <disk address> type */
 typedef char *                  xfs_caddr_t;    /* <core address> type */
 typedef __u32                   xfs_dev_t;
@@ -111,8 +111,6 @@ typedef __uint64_t	xfs_fileoff_t;	/* block number in a file */
 typedef __int64_t       xfs_sfiloff_t;  /* signed block number in a file */
 typedef __uint64_t      xfs_filblks_t;  /* number of blocks in a file */
-typedef __uint8_t       xfs_arch_t;     /* architecture of an xfs fs */
 /*
 * Null values for the types.
 */
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index f07bf8768c3a..0e55c5d7db5f 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -862,7 +862,7 @@ xfs_inactive_symlink_rmt(
         * Find the block(s) so we can inval and unmap them.
         */
        done = 0;
-        XFS_BMAP_INIT(&free_list, &first_block);
+        xfs_bmap_init(&free_list, &first_block);
        nmaps = ARRAY_SIZE(mval);
        if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size),
                        XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps,
@@ -1288,7 +1288,7 @@ xfs_inactive(
        /*
         * Free the inode.
         */
-        XFS_BMAP_INIT(&free_list, &first_block);
+        xfs_bmap_init(&free_list, &first_block);
        error = xfs_ifree(tp, ip, &free_list);
        if (error) {
                /*
@@ -1461,7 +1461,7 @@ xfs_create(
        xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
        unlock_dp_on_error = B_TRUE;
-        XFS_BMAP_INIT(&free_list, &first_block);
+        xfs_bmap_init(&free_list, &first_block);
        ASSERT(ip == NULL);
@@ -1879,7 +1879,7 @@ xfs_remove(
                }
        }
-        XFS_BMAP_INIT(&free_list, &first_block);
+        xfs_bmap_init(&free_list, &first_block);
        error = xfs_dir_removename(tp, dp, name, ip->i_ino,
                                        &first_block, &free_list, resblks);
        if (error) {
@@ -2059,7 +2059,7 @@ xfs_link(
        if (error)
                goto error_return;
-        XFS_BMAP_INIT(&free_list, &first_block);
+        xfs_bmap_init(&free_list, &first_block);
        error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
                                        &first_block, &free_list, resblks);
@@ -2231,7 +2231,7 @@ xfs_mkdir(
        xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
        unlock_dp_on_error = B_FALSE;
-        XFS_BMAP_INIT(&free_list, &first_block);
+        xfs_bmap_init(&free_list, &first_block);
        error = xfs_dir_createname(tp, dp, dir_name, cdp->i_ino,
                                        &first_block, &free_list, resblks ?
@@ -2438,7 +2438,7 @@ xfs_symlink(
         * Initialize the bmap freelist prior to calling either
         * bmapi or the directory create code.
         */
-        XFS_BMAP_INIT(&free_list, &first_block);
+        xfs_bmap_init(&free_list, &first_block);
        /*
         * Allocate an inode for the symlink.
@@ -2860,7 +2860,7 @@ retry:
                /*
                 * Issue the xfs_bmapi() call to allocate the blocks
                 */
-                XFS_BMAP_INIT(&free_list, &firstfsb);
+                xfs_bmap_init(&free_list, &firstfsb);
                error = xfs_bmapi(tp, ip, startoffset_fsb,
                                  allocatesize_fsb, bmapi_flag,
                                  &firstfsb, 0, imapp, &nimaps,
@@ -2980,7 +2980,7 @@ xfs_zero_remaining_bytes(
                XFS_BUF_UNDONE(bp);
                XFS_BUF_UNWRITE(bp);
                XFS_BUF_READ(bp);
-                XFS_BUF_SET_ADDR(bp, XFS_FSB_TO_DB(ip, imap.br_startblock));
+                XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock));
                xfsbdstrat(mp, bp);
                error = xfs_iowait(bp);
                if (error) {
@@ -3186,7 +3186,7 @@ xfs_free_file_space(
                /*
                 * issue the bunmapi() call to free the blocks
                 */
-                XFS_BMAP_INIT(&free_list, &firstfsb);
+                xfs_bmap_init(&free_list, &firstfsb);
                error = xfs_bunmapi(tp, ip, startoffset_fsb,
                                  endoffset_fsb - startoffset_fsb,
                                  0, 2, &firstfsb, &free_list, NULL, &done);
author	James Morris <jmorris@namei.org>	2009-02-05 19:01:45 -0500
committer	James Morris <jmorris@namei.org>	2009-02-05 19:01:45 -0500
commit	cb5629b10d64a8006622ce3a52bc887d91057d69 (patch)
tree	7c06d8f30783115e3384721046258ce615b129c5 /fs
parent	8920d5ad6ba74ae8ab020e90cc4d976980e68701 (diff)
parent	f01d1d546abb2f4028b5299092f529eefb01253a (diff)